#!/usr/bin/python
"""Fetch the page images of a US patent from the USPTO site and build a PDF.

Usage: pass the patent number as the only command-line argument.

Every page is downloaded as a TIFF to /tmp/<patnum>_<page>.tif, converted
with the external ``tiff2ps`` tool into /tmp/<patnum>.ps (page 1 truncates
the file, later pages append), and the PostScript file is finally handed to
the external ``ps2pdf`` tool.
"""

import contextlib
import os
import re
import shutil
import subprocess
import sys

try:  # Python 2 module names
    import urllib2
    import urlparse
except ImportError:  # Python 3: same functions, relocated modules
    import urllib.request as urllib2
    import urllib.parse as urlparse

# Patterns used to walk from the search result to the page-image viewer.
_REFRESH_RE = re.compile(r'HTTP-EQUIV.*URL=([^\"]*)"')
_IMAGE_LINK_RE = re.compile(r'a href=(\"?http://patimg[^\"]*\"?)')
_PAGE_COUNT_RE = re.compile(r'1 +of +(\d+) +page')
_EMBED_RE = re.compile(r'embed src=(\"?[^\"]*\")?')


def _to_text(raw):
    """Return *raw* as a native string so the str regexes can search it."""
    if isinstance(raw, str):
        return raw
    # latin-1 maps every byte to a code point, so decoding cannot fail.
    return raw.decode('latin-1')


def _fetch_lines(url):
    """Download *url* and return its lines as native strings."""
    with contextlib.closing(urllib2.urlopen(url)) as response:
        return [_to_text(line) for line in response.readlines()]


def _last_group(pattern, lines):
    """Return group 1 of the last non-empty match of *pattern*, or None."""
    found = None
    for line in lines:
        match = pattern.search(line)
        # Group 1 may be None (optional group) or empty; skip those matches.
        if match and match.group(1):
            found = match.group(1)
    return found


def _warn(message):
    """Write a diagnostic line to stderr."""
    sys.stderr.write(message + '\n')


def pg2tiff(url, patnum, pgnum):
    """Download one page image and return the path of the saved TIFF.

    The ``PageNum=`` value inside *url* is replaced by *pgnum*; the result
    is printed and fetched into /tmp/<patnum>_<pgnum>.tif.
    """
    pgurl = re.sub(r'PageNum=\d*', 'PageNum=' + str(pgnum), url)
    print(pgurl)
    tifffn = '/tmp/' + patnum + '_' + str(pgnum) + '.tif'
    with contextlib.closing(urllib2.urlopen(pgurl)) as src:
        # TIFF data is binary: text mode would corrupt it (or fail outright).
        with open(tifffn, 'wb') as dst:
            shutil.copyfileobj(src, dst, 4096)
    return tifffn


def pg2ps(url, patnum, pgnum):
    """Fetch page *pgnum* and add it to /tmp/<patnum>.ps; return that path.

    Page 1 starts the PostScript file afresh, every other page is appended.

    Raises:
        subprocess.CalledProcessError: if ``tiff2ps`` exits non-zero.
        OSError: if ``tiff2ps`` cannot be started.
    """
    tifffn = pg2tiff(url, patnum, pgnum)
    psfn = '/tmp/' + patnum + '.ps'
    mode = 'wb' if pgnum == 1 else 'ab'
    with open(psfn, mode) as out:
        # Argument list instead of a shell string: the patent number comes
        # from the command line and must not be interpreted by a shell.
        subprocess.check_call(['tiff2ps', '-2', tifffn], stdout=out)
    return psfn


def pat2pdf(patnum):
    """Locate the page images of patent *patnum* and convert them to a PDF.

    Progress URLs are printed to stdout.  When a required link or the page
    count cannot be found, a message is written to stderr and the function
    returns without producing output.

    Raises:
        subprocess.CalledProcessError: if a conversion tool exits non-zero.
        OSError: if a conversion tool cannot be started.
    """
    url = ('http://patft.uspto.gov/netacgi/nph-Parser?TERM1=' + patnum +
           '&Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1'
           '&u=%2Fnetahtml%2Fsrchnum.htm&r=0&f=S&l=50')
    lines = _fetch_lines(url)

    # Follow the meta-refresh redirect when present; otherwise search the
    # page that was already downloaded.
    redirect = _last_group(_REFRESH_RE, lines)
    if redirect:
        url = urlparse.urljoin(url, redirect)
        print(url)
        lines = _fetch_lines(url)

    image_link = _last_group(_IMAGE_LINK_RE, lines)
    imgurl = image_link.strip('"') if image_link else None
    if not imgurl:
        _warn('no page-image link found for patent ' + patnum)
        return
    print(imgurl)

    viewer_lines = _fetch_lines(imgurl)
    page_count = _last_group(_PAGE_COUNT_RE, viewer_lines)
    embed = _last_group(_EMBED_RE, viewer_lines)
    emburl = embed.strip('"') if embed else None
    n_pages = int(page_count) if page_count else 0
    if not emburl or n_pages < 1:
        _warn('no embedded image or page count found at ' + imgurl)
        return

    page_url = urlparse.urljoin(imgurl, emburl)
    psfn = None
    for page in range(1, n_pages + 1):
        psfn = pg2ps(page_url, patnum, page)
    subprocess.check_call(['ps2pdf', psfn])


def main(argv):
    """Run the conversion for the patent number given on the command line."""
    if len(argv) != 2:
        _warn('usage: ' + argv[0] + ' PATENT_NUMBER')
        return 2
    pat2pdf(argv[1])
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))