#!/usr/bin/env bash
#
# pat2pdf version 1.32
# Copyright (c)2000 Oren Tirosh
# Copyright (c)2002 Thomas Boege
# Released under the GPL
#
# This script connects to the USPTO patent database, retrieves the TIFF
# patent images and converts them into a single pdf file using libtiff.
#
# Yes, I know, you would have written it in perl/python/(insert your
# favourite scripting language here)
#
# It requires an http fetcher (lynx by default), tiffcp and tiff2pdf
# (both part of libtiff)
#
#Usage:
#  pat2pdf <patent number>|<application number>
#
# Result is a file in the current directory named pat<number>.pdf
#
#Bugs:
#
# Error checking and recovery could be better.
#
#Homepage:
#  http://www.tothink.com/pat2pdf
#

# Globals shared between the functions below:
#   PATNUM      - number given on the command line, commas removed
#   IMAGEURL    - URL of the page that embeds the TIFF images
#   IMAGESERVER - scheme and host of the image server
#   WORKDIR     - temporary directory holding the downloaded TIFF pages
PATNUM=""
IMAGEURL=""
IMAGESERVER=""
WORKDIR=""

# Some utility functions:

# Succeeds (exit status 0) if the first string is non-empty and contains the
# second string. The second string is used as a glob pattern, so ?, * and
# [x-y] inside it act as wildcards.
#   $1 - string to search
#   $2 - pattern to look for
contains() {
  # $2 is intentionally left unquoted inside the expansion so that its
  # globbing characters stay active.
  [[ -n "$1" && -z "${1##*$2*}" ]]
}

# Fetch the URL given in $1 and write the response to stdout.
# Change this if you prefer wget, curl, etc. Don't forget to also update
# the calls to verify the presence of the executable.
url2stdout() {
  lynx -dump -source "$1" 2>/dev/null
  # wget -Y off -t 13 -O - "$1" 2>/dev/null
}

# Write all arguments, joined by spaces, to stderr.
werr() {
  printf '%s\n' "$*" >&2
}

# Die with a message: print the arguments to stderr and exit with status 1.
die() {
  werr "$*"
  exit 1
}

# Extract a field from a string and echo it to stdout.
#   $1 - source string
#   $2 - text that appears directly before the target field
#   $3 - prefix of the target field (kept in the output)
#   $4 - text that appears after the target field (output stops before it)
# $2, $3 and $4 are used as glob patterns. When a marker occurs several
# times, the field after the LAST "$2$3" is taken and it is cut at the
# FIRST "$4" following it. An empty $4 therefore yields an empty field.
# Returns 1 if $1 is empty or does not match *$2$3*$4*.
extract() {
  local source=$1 before=$2 prefix=$3 after=$4
  local field

  [[ -n "$source" ]] || return 1
  [[ -z "${source##*$before$prefix*$after*}" ]] || return 1

  field="$prefix${source##*$before$prefix}"
  field="${field%%$after*}"
  printf '%s\n' "$field"
}

# Verify the presence of a required executable; die if it cannot be found.
#   $1 - command name
verify() {
  command -v "$1" >/dev/null 2>&1 ||
    die "Error: required executable $1 not found"
}

# Remove the temporary working directory, if one was created.
# Installed as an EXIT trap so it also runs when die() aborts the script.
cleanup() {
  if [[ -n "$WORKDIR" ]]; then
    rm -rf -- "$WORKDIR"
  fi
}

# Look up an issued patent and locate its image page.
# Globals: PATNUM (read), IMAGEURL and IMAGESERVER (written)
# Outputs: progress messages on stderr, the patent title line on stdout
# Dies if any page cannot be fetched or parsed (a missing title is not fatal).
pat_issued() {
  local site="http://patft.uspto.gov"
  local result_page patent_url patent_page title

  werr "...fetching search results page for patent $PATNUM"
  result_page=$(url2stdout "${site}/netacgi/nph-Parser?patentnumber=${PATNUM}") ||
    die "Error fetching search results web page."
  if contains "$result_page" "No patents have matched"; then
    die "No patents have matched your query."
  fi

  patent_url=$(extract "$result_page" "URL=" "/netacgi/nph-Parse" "\">") ||
    die "Error isolating URL from results page."
  patent_url="${site}${patent_url}"

  werr "...fetching patent page"
  # Only the first 70 lines are kept. pipefail is deliberately not enabled
  # in this script: head closing the pipe early may make the fetcher exit
  # with a non-zero status even though the data we need was received.
  patent_page=$(url2stdout "$patent_url" | head -n 70) ||
    die "Error fetching patent web page."

  # The terminator must be non-empty, otherwise extract returns an empty
  # title. NOTE(review): assumes the title is wrapped in
  # <font size="+1">...</font> - confirm against a live page.
  title=$(extract "$patent_page" "font size=\"+1\">" "" "</font>") ||
    title="##Error isolating patent title - continuing anyway##"
  printf '%s\n' "U.S. Patent $PATNUM: $title"

  IMAGEURL=$(extract "$patent_page" "a href=" "http://patimg" ">") ||
    die "Error isolating image page URL from patent page."
  IMAGESERVER=$(extract "$IMAGEURL" "" "http://patimg" "/.piw") ||
    die "Error isolating image server name."
}

# Look up a published patent application and locate its image page.
# Globals: PATNUM (read), IMAGEURL and IMAGESERVER (written)
# Outputs: progress messages on stderr, the application title line on stdout
# Dies if any page cannot be fetched or parsed (a missing title is not fatal).
pat_appl() {
  local site="http://appft1.uspto.gov"
  local result_url result_page patent_url patent_page title

  werr "...fetching search results page for patent application $PATNUM"
  result_url="${site}/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.html&r=0&p=1&f=S&l=50&Query=DN%2F$PATNUM&d=PG01"
  result_page=$(url2stdout "$result_url") ||
    die "Error fetching search results web page."
  if contains "$result_page" "No patents have matched"; then
    die "No patents have matched your query."
  fi

  patent_url=$(extract "$result_page" "HREF=" "/netacgi/nph-Parse" ">") ||
    die "Error isolating URL from results page."
  patent_url="${site}${patent_url}"

  werr "...fetching patent page"
  # See pat_issued for why a failing fetcher is tolerated in this pipeline.
  patent_page=$(url2stdout "$patent_url" | head -n 70) ||
    die "Error fetching patent web page."

  # The terminator must be non-empty, otherwise extract returns an empty
  # title. NOTE(review): assumes the title is wrapped in
  # <font size="+1">...</font> - confirm against a live page.
  title=$(extract "$patent_page" "font size=\"+1\">" "" "</font>") ||
    title="##Error isolating patent title - continuing anyway##"
  printf '%s\n' "U.S. Patent Application $PATNUM: $title"

  IMAGEURL=$(extract "$patent_page" "a href=" "http://aiw" ">") ||
    die "Error isolating image page URL from patent page."
  IMAGESERVER=$(extract "$IMAGEURL" "" "http://aiw" "/.aiw?") ||
    die "Error isolating image server name."
}

# Entry point.
#   $1 - patent number (up to 7 characters) or application number
#        (11 characters); commas are ignored
# Writes pat<number>.pdf to the current directory and reports progress on
# stderr. Exits with status 1 on any error.
main() {
  local tool
  for tool in lynx tiffcp tiff2pdf head mktemp rm; do
    verify "$tool"
  done
  #verify "wget"

  PATNUM=${1-}
  PATNUM=${PATNUM//,/}
  [[ -n "$PATNUM" ]] ||
    die "usage: pat2pdf <patent number>|<application number>"

  # Two leading characters that may be letters (design, reissue, ... prefixes)
  # followed only by digits. The number ends up in URLs and file names, so
  # nothing else is allowed through.
  local -r patnum_re='^[a-zA-Z1-9][a-zA-Z0-9][0-9]{2,}$'
  [[ "$PATNUM" =~ $patnum_re ]] || die "Use a 7 digit patent number."

  local -r len=${#PATNUM}
  if ((len > 11)); then
    die "Too big number"
  elif ((len == 11)); then
    pat_appl
  elif ((len <= 7)); then
    pat_issued
  else
    # 8 to 10 characters is neither a patent nor an application number.
    die "Use a 7 digit patent number or an 11 digit application number."
  fi

  local image_page num_pages tiff_url tiff_prefix tiff_suffix id_key

  werr "...fetching images page"
  image_page=$(url2stdout "$IMAGEURL") || die "Error fetching images page."

  num_pages=$(extract "$image_page" "-- NumPages=" "" " --") ||
    die "Error getting number of pages."
  [[ "$num_pages" =~ ^[0-9]+$ ]] || die "Error getting number of pages."
  # Force base 10 so a value with leading zeros is not read as octal.
  num_pages=$((10#$num_pages))

  # The ? in the markers matches the quote character around the src value.
  tiff_url=$(extract "$image_page" "embed src=?" "/.DImg" "? width=") ||
    die "Error getting TIFF file URL."
  tiff_url="${IMAGESERVER}${tiff_url}"
  contains "$tiff_url" "PageNum=1" || die "Error processing TIFF file URL"

  # Split the page-1 URL into the part before the page number and a suffix
  # rebuilt around the IDKey value that requests the image as TIFF.
  tiff_prefix="${tiff_url%%PageNum=1*}PageNum="
  id_key="${tiff_url##*IDKey=}"
  id_key="${id_key%%?ImgFormat*}"
  tiff_suffix="&ImgFormat=tif&IDKey=${id_key}"

  WORKDIR=$(mktemp -d) || die "Error creating temporary directory."
  trap cleanup EXIT

  local -a page_files=()
  local page page_file
  for ((page = 1; page <= num_pages; page++)); do
    werr "...fetching page $page of $num_pages"
    page_file="${WORKDIR}/page-${page}.tiff"
    url2stdout "${tiff_prefix}${page}${tiff_suffix}" >"$page_file" ||
      die "Error retrieving TIFF page."
    page_files+=("$page_file")
  done

  local -r combined="${WORKDIR}/all.tiff"
  tiffcp "${page_files[@]}" "$combined" &>/dev/null ||
    die "Error combining images with tiffcp"
  tiff2pdf -p a4 -o "pat${PATNUM}.pdf" "$combined" &>/dev/null ||
    die "Error generating pdf with tiff2pdf"

  werr Done.
  werr "pat${PATNUM}.pdf"
}

main "$@"