#!/usr/bin/env python
#
# coool: Checks OpenOffice.Org Links
# Reports broken hyperlinks in documents
#
# Updates and more information available
# on http://free-electrons.com/community/tools/utils/coool
#
# Version 1.1
# Copyright (C) 2005-2007 Free Electrons
# Author: Michael Opdenacker
# Home page: http://free-electrons.com/community/tools/utils/coool
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
# NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 675 Mass Ave, Cambridge, MA 02139, USA.

# Note: this is a Python 2 script (urllib2, thread, print statements).
# configparse is not part of the standard library: it is an
# optparse-compatible option parser with configuration file support.
import sys, os, zipfile, xml.parsers.expat, urllib2, urllib, urlparse, httplib, thread, time, configparse, string

lock = thread.allocate_lock()
host_lock = thread.allocate_lock()

##########################################################
# Common routines
##########################################################

def debug(s):
    # Display debug information if debug mode is set
    global options
    if options.debug:
        print '[DEBUG] ' + s

def repeat_string(s, number):
    # Returns a string containing number times the given string
    return s * number

def url_failure(url, why):
    global link_text, link_type, link_xml_file, input_file, found_errors
    # Report the file name the first time
    # an error is found in the current file
    if not found_errors:
        found_errors = True
        print input_file
        print '-' * len(input_file)
    print '[ERROR] URL failure: ' + url
    print '        Reason:', why
    num_links = len(link_type[url])
    if num_links > 1:
        print '        ' + str(num_links) + ' links to this URL:'
    for i in range(num_links):
        print '        ' + string.capwords(link_type[url][i]) + ' link: "' + link_text[url][i] + '"'
        if link_xml_file[url][i] == 'styles.xml':
            print '        (found in style data: header, footer, master page...)'
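# As an illustration of the report format produced by url_failure()
# above (a made-up example, not real output; the file name, URL and
# failure reason are invented):
#
#   slides.odp
#   ----------
#   [ERROR] URL failure: http://example.com/old-page
#           Reason: HTTP Error 404: Not Found
#           2 links to this URL:
#           Text link: "see our old page"
#           Image link: "logo"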
def check_http_url(url):
    request = urllib2.Request(url)
    opener = urllib2.build_opener()
    # Add a browser-like User-Agent.
    # Some sites (like wikipedia) don't seem to accept requests
    # with the default urllib2 User-Agent
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.8) Gecko/20050511')
    try:
        opener.open(request).read()
    except urllib2.HTTPError, why:
        url_failure(url, why)
    except urllib2.URLError, why:
        # Doesn't work in all cases
        # url_failure(url, why.__getitem__(0)[1])
        url_failure(url, why)
    except httplib.BadStatusLine, why:
        url_failure(url, why)

def check_ftp_url(url):
    # urllib2 doesn't raise any exception when a
    # ftp url for an invalid file is given.
    # Only invalid domains are reported.
    # That's why we are using urllib.urlretrieve
    try:
        tmpfile = urllib.urlretrieve(url)[0]
    except IOError, why:
        # Doesn't work in all cases
        # url_failure(url, why.__getitem__(1)[1])
        url_failure(url, why)
    else:
        # With a non-existing file on a ftp URL,
        # we get a zero size output file.
        # Even if a real zero size file exists,
        # it's worth highlighting anyway
        # (no point in making a hyperlink to it)
        if os.path.getsize(tmpfile) == 0:
            url_failure(url, 'Non existing or empty file')
        os.remove(tmpfile)

def check_internal_link(url):
    # Checks whether the internal link corresponds
    # to a correct reference.
    global internal_targets
    target = url[1:]
    # Remove a trailing ' (Notes)' substring if any.
    # This assumes that if the page exists, its notes page
    # exists too.
    if target.endswith(' (Notes)'):
        target = target[:-8]
    if target not in internal_targets:
        url_failure(url, 'No such target within document')

def check_url(url):
    debug('Checking link: ' + url)
    protocol = urlparse.urlparse(url)[0]
    if url[0] == '#':
        check_internal_link(url)
    elif protocol == 'http' or protocol == 'https':
        check_http_url(url)
    elif protocol == 'ftp':
        check_ftp_url(url)
    else:
        # Still try to handle other protocols
        try:
            urllib2.urlopen(url)
        except:
            url_failure(url, 'Unknown - Please report this to the developers of this script!')

def get_hostname(url):
    return urlparse.urlparse(url)[1]

def url_is_ignored(url):
    global options
    hostname = get_hostname(url)
    return hostname in options.exclude_hosts

def check_url_threaded(url):
    global lock, host_lock, thread_count, tokens_per_host, options
    # Check that we don't run too many parallel checks per host.
    # That could bring the host down, or at least be considered
    # as a Denial of Service attack.
    hostname = get_hostname(url)
    if options.verbose:
        print '[INFO] Checking hyperlink:', url
    # Counting parallel requests to the same host.
    # Of course, ignore local links
    if hostname != '':
        wait = True
        host_lock.acquire()
        if not tokens_per_host.has_key(hostname):
            tokens_per_host[hostname] = options.max_requests_per_host
        host_lock.release()
        while wait:
            host_lock.acquire()
            if tokens_per_host[hostname] > 0:
                tokens_per_host[hostname] -= 1
                host_lock.release()
                wait = False
            else:
                host_lock.release()
                time.sleep(1)
    # Do the URL check!
    check_url(url)
    # The thread is about to complete, decrement the counters
    lock.acquire()
    thread_count -= 1
    lock.release()
    if hostname != '':
        host_lock.acquire()
        tokens_per_host[hostname] += 1
        host_lock.release()
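# Worked example of the per-host throttling above (hypothetical trace,
# assuming --max-requests-per-host 2 and three simultaneous links to
# the same host):
#   thread 1: takes a token (1 left), starts its check
#   thread 2: takes the last token (0 left), starts its check
#   thread 3: finds no token, sleeps in 1-second steps until thread 1
#             or 2 completes and gives its token back
# So at most max_requests_per_host requests hit a given host at once.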
def xml_start_element(name, attributes):
    # Called by the xml parser on each xml start tag
    global link_text, link_type, link_xml_file, xml_file, current_url, inside_link, internal_targets, inside_frame, current_frame
    global inside_heading, heading_level
    # Process hyperlinks
    if attributes.has_key('xlink:href') and (name == 'text:a' or name == 'form:button' or name == 'draw:a'):
        inside_link = True
        current_url = attributes['xlink:href']
        if not link_text.has_key(current_url):
            link_text[current_url], link_type[current_url], link_xml_file[current_url] = [], [], []
        link_xml_file[current_url].append(xml_file)
        if name == 'text:a':
            link_type[current_url].append('text')
        elif name == 'form:button':
            link_text[current_url].append(attributes['form:label'])
            link_type[current_url].append('button')
        elif name == 'draw:a':
            link_text[current_url].append(attributes['office:name'])
            link_type[current_url].append('image')
    # Presentations: record page names, to which internal links can be made
    elif name == 'draw:page':
        internal_targets.add(attributes['draw:name'])
    # Text documents: things to which internal links can be made
    elif name == 'text:bookmark':
        internal_targets.add(attributes['text:name'])
    elif name == 'text:section':
        internal_targets.add(attributes['text:name'] + '|region')
    elif name == 'draw:frame' and attributes.has_key('draw:name'):
        current_frame = attributes['draw:name']
        inside_frame = True
    elif name == 'table:table':
        internal_targets.add(attributes['table:name'] + '|table')
    elif name == 'text:h':
        inside_heading = True
        heading_level = int(attributes['text:outline-level'])
    elif inside_frame:
        if name == 'draw:image':
            internal_targets.add(current_frame + '|graphic')
        elif name == 'draw:text-box':
            internal_targets.add(current_frame + '|frame')
        elif name == 'draw:object':
            internal_targets.add(current_frame + '|ole')

def xml_char_data(data):
    global current_url, current_element, link_type, link_text
    global inside_heading, heading_level, internal_targets
    if inside_link:
        if link_type[current_url][-1] == 'text':
            # Truncate link text if too long
            if len(data) > 77:
                data = data[:77] + '...'
            link_text[current_url].append(data)
    if inside_heading:
        internal_targets.add(repeat_string('0.', heading_level) + data + '|outline')
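# For reference, the ODF markup these handlers process looks roughly
# like this (simplified; namespace declarations omitted, attribute
# values are made-up examples, element names come from the code above):
#
#   <text:a xlink:href="http://example.com/">example site</text:a>
#       -> link_type 'text', link_text 'example site'
#   <text:h text:outline-level="2">Details</text:h>
#       -> internal target '0.0.Details|outline'
#   <draw:page draw:name="Overview">
#       -> internal target 'Overview', so a '#Overview' link resolves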
def xml_end_element(name):
    global inside_link, inside_frame, inside_heading
    if inside_link and (name == 'text:a' or name == 'form:button' or name == 'draw:a'):
        inside_link = False
    elif inside_frame and (name == 'draw:frame'):
        inside_frame = False
    elif inside_heading and (name == 'text:h'):
        inside_heading = False

##########################################################
# Main program
##########################################################

# Command parameters
# Either default values, found in a configuration file,
# or on the command line

usage = 'usage: %prog [options] [OpenOffice.org document files]'
description = 'Checks OpenOffice.org documents for broken links'

optparser = configparse.OptionParser(usage=usage, version='coool 1.1', description=description)

optparser.add_option('-v', '--verbose', config='true', action='store_true', dest='verbose', default=False,
                     help='display progress information')
optparser.add_option('-d', '--debug', config='true', action='store_true', dest='debug', default=False,
                     help='display debug information')
optparser.add_option('-t', '--max-threads', config='true', action='store', type='int', dest='max_threads', default=100,
                     help='set the maximum number of parallel threads to create')
optparser.add_option('-r', '--max-requests-per-host', config='true', action='store', type='int', dest='max_requests_per_host', default=5,
                     help='set the maximum number of parallel requests per host')
optparser.add_option('-x', '--exclude-hosts', config='true', action='store', type='string', dest='exclude_hosts', default='',
                     help='ignore URLs whose host name belongs to the given list')

(options, args) = optparser.parse_args(files=[os.path.expanduser('~/.cooolrc')])

if len(args) == 0:
    print 'No files to check. Exiting.'
    sys.exit()

# Turn options.exclude_hosts into a list, for exact matching
options.exclude_hosts = options.exclude_hosts.split()
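# All of the options above can also be set in ~/.cooolrc, which is read
# just before the command line is parsed. The exact file syntax depends
# on the configparse module imported above; assuming it accepts simple
# 'name = value' pairs keyed on the option dest names, a configuration
# could look like:
#
#   verbose = True
#   max_threads = 50
#   max_requests_per_host = 2
#   exclude_hosts = localhost intranet.example.com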
# Iterate on all given documents
for input_file in args:
    if options.verbose:
        print '[INFO] Checking links in file', input_file, '...'

    link_text = dict()
    link_type = dict()
    link_xml_file = dict()
    internal_targets = set()
    thread_count = 0
    tokens_per_host = dict()

    # Unzip the OpenOffice (Open Document Format) document
    try:
        zip = zipfile.ZipFile(input_file, 'r')
    except IOError, (errno, why):
        print ' ERROR: Cannot open file', input_file, ':', why
        sys.exit()
    except zipfile.BadZipfile:
        print ' ERROR: not an OpenOffice.org Open Document Format document:', input_file
        sys.exit()

    # Parse xml files and record urls
    for xml_file in ['styles.xml', 'content.xml']:
        try:
            xml_contents = zip.read(xml_file)
        except KeyError:
            print ' ERROR: not an OpenOffice.org Open Document Format document:', input_file
            sys.exit()
        inside_link, inside_frame, inside_heading = False, False, False
        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = xml_start_element
        parser.CharacterDataHandler = xml_char_data
        parser.EndElementHandler = xml_end_element
        parser.Parse(xml_contents)
    zip.close()

    # Run URL checks
    found_errors = False
    for url in link_type.keys():
        if not url_is_ignored(url):
            # Do not exceed the max number of threads
            while thread_count > options.max_threads:
                time.sleep(1)
            # Keep track of the number of pending URL checks.
            # It will be up to the checking thread to decrease thread_count
            lock.acquire()
            thread_count += 1
            lock.release()
            # Use a dedicated variable for the thread arguments:
            # reusing 'args' here would clobber the document list
            # that the outer loop is iterating over
            thread_args = (url,)
            thread.start_new_thread(check_url_threaded, thread_args)
    # Wait for all threads to complete
    while thread_count > 0:
        if options.verbose:
            print '[INFO] Waiting for URL checks to complete:', thread_count, 'threads left'
        time.sleep(1)

    if options.verbose:
        print '[INFO] Hyperlink checking complete.'
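# Example invocations (document names are placeholders):
#
#   coool presentation.odp
#   coool -v -t 50 report.odt book.odt
#   coool -x "localhost intranet.example.com" slides.odp
#
# The first checks a single presentation; the second checks two text
# documents verbosely with at most 50 parallel threads; the third skips
# every link whose host name appears in the given list.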