Sisyphus repository
ALT Linux repos
5.0: 1.1-alt1.1
4.1: 1.1-alt1.1
4.0: 1.0-alt0.1beta

Group :: Office
RPM: coool


#!/usr/bin/env python
#
# coool: Checks OpenOffice.Org Links
# Reports broken hyperlinks in documents
#
# Updates and more information available
# on http://free-electrons.com/community/tools/utils/coool
#
# Version 1.0 beta1
# Copyright (C) 2005 Free Electrons
# Author: Michael Opdenacker <michael@free-electrons.com>
# Home page: http://free-electrons.com/community/tools/utils/coool
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
# NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 675 Mass Ave, Cambridge, MA 02139, USA.

import sys, os, zipfile, xml.parsers.expat, urllib2, urllib, urlparse, httplib, thread, time, optparse, string

lock = thread.allocate_lock()
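# This lock protects the shared 'thread_count' counter: the low-level
# 'thread' module has no join(), so the main loop below polls the
# counter until every URL-checking thread has finished.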


##########################################################
# Common routines
##########################################################

def repeat_string(s, number):

    # Returns a string containing the given string repeated 'number' times

    result = ''

    for i in range(number):
        result += s

    return result

def url_failure(url, why):

    # Reports a broken URL, together with every link pointing to it

    global link_text, link_type
    print ' URL failure: ' + url
    print ' Reason:', why

    num_links = len(link_type[url])

    if num_links > 1:
        print ' ' + str(num_links) + ' links to this URL:'

    for i in range(num_links):
        print ' ' + string.capwords(link_type[url][i]) + ' link: "' + link_text[url][i] + '"'

def check_http_url(url):

    request = urllib2.Request(url)
    opener = urllib2.build_opener()

    # Add a browser-like User-Agent. Some sites (like wikipedia) don't seem to accept requests
    # with the default urllib2 User-Agent
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.8) Gecko/20050511')

    try:
        opener.open(request).read()
    except urllib2.HTTPError, why:
        url_failure(url, why)
    except urllib2.URLError, why:
        # Doesn't work in all cases
        # url_failure(url, why.__getitem__(0)[1])
        url_failure(url, why)
    except httplib.BadStatusLine, why:
        url_failure(url, why)

def check_ftp_url(url):

    # urllib2 doesn't raise any exception when an ftp URL
    # for an invalid file is given; only invalid domains
    # are reported. That's why we are using urllib.urlretrieve

    try:
        tmpfile = urllib.urlretrieve(url)[0]
    except IOError, why:
        # Doesn't work in all cases
        # url_failure(url, why.__getitem__(1)[1])
        url_failure(url, why)
    else:
        # With a non-existing file at an ftp URL,
        # we get a zero-size output file.
        # Even if a real zero-size file exists,
        # it's worth highlighting anyway
        # (no point in making a hyperlink to it)
        if os.path.getsize(tmpfile) == 0:
            url_failure(url, 'Non-existing or empty file')
        os.remove(tmpfile)

def check_internal_link(url):

    # Checks whether the internal link corresponds
    # to a correct reference.

    global internal_targets

    target = url[1:]

    # Remove a trailing ' (Notes)' substring if any.
    # This assumes that if the page exists, its notes page
    # exists too.

    if target.endswith(' (Notes)'):
        target = target[:-8]

    if target not in internal_targets:
        url_failure(url, 'No such target within document')


def check_url(url):

    protocol = urlparse.urlparse(url)[0]

    if url[0] == '#':
        check_internal_link(url)
    elif protocol == 'http' or protocol == 'https':
        check_http_url(url)
    elif protocol == 'ftp':
        check_ftp_url(url)
    else:
        # Still try to handle other protocols
        try:
            urllib2.urlopen(url)
        except:
            url_failure(url, 'Unknown - Please report this to the developers of this script!')
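# Dispatch summary for check_url() (the URLs below are only illustrative):
#   '#Slide 1'                -> check_internal_link()
#   'http://example.com/doc'  -> check_http_url()
#   'ftp://example.com/file'  -> check_ftp_url()
#   anything else             -> a plain urllib2.urlopen() attempt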


def check_url_threaded(url):

    global lock, thread_count

    check_url(url)

    # The thread is about to complete, decrement the counter
    lock.acquire()
    thread_count -= 1
    lock.release()

def xml_start_element(name, attributes):

    # Called by the xml parser on each xml start tag

    global link_text, link_type, current_url, inside_link, internal_targets, inside_frame, current_frame
    global inside_heading, heading_level

    # Process hyperlinks

    if attributes.has_key('xlink:href') and (name == 'text:a' or name == 'form:button' or name == 'draw:a'):

        inside_link = True
        current_url = attributes['xlink:href']

        if not link_text.has_key(current_url):
            link_text[current_url], link_type[current_url] = [], []

        if name == 'text:a':
            link_type[current_url].append('text')
        elif name == 'form:button':
            link_text[current_url].append(attributes['form:label'])
            link_type[current_url].append('button')
        elif name == 'draw:a':
            link_text[current_url].append(attributes['office:name'])
            link_type[current_url].append('image')

    # Presentations: record page names, to which internal links can be made

    elif name == 'draw:page':
        internal_targets.add(attributes['draw:name'])

    # Text documents: things to which internal links can be made

    elif name == 'text:bookmark':
        internal_targets.add(attributes['text:name'])
    elif name == 'text:section':
        internal_targets.add(attributes['text:name'] + '|region')
    elif name == 'draw:frame' and attributes.has_key('draw:name'):
        current_frame = attributes['draw:name']
        inside_frame = True
    elif name == 'table:table':
        internal_targets.add(attributes['table:name'] + '|table')
    elif name == 'text:h':
        inside_heading = True
        heading_level = int(attributes['text:outline-level'])
    elif inside_frame:
        if name == 'draw:image':
            internal_targets.add(current_frame + '|graphic')
        elif name == 'draw:text-box':
            internal_targets.add(current_frame + '|frame')
        elif name == 'draw:object':
            internal_targets.add(current_frame + '|ole')
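# The '|region', '|table', '|graphic', '|frame', '|ole' and '|outline'
# suffixes mirror the form in which internal hyperlink targets appear
# in the document itself (e.g. '#MyTable|table', where the name is
# illustrative): check_internal_link() strips the leading '#' and looks
# the remainder up in internal_targets.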

def xml_char_data(data):

    # Called by the xml parser on character data

    global current_url, current_element, link_type, link_text
    global inside_heading, heading_level, internal_targets

    if inside_link:
        if link_type[current_url][-1] == 'text':

            # Truncate link text if too long

            if len(data) > 77:
                data = data[:77] + '...'

            link_text[current_url].append(data)

    if inside_heading:
        internal_targets.add(repeat_string('0.', heading_level) + data + '|outline')

def xml_end_element(name):

    # Called by the xml parser on each xml end tag

    global inside_link, inside_frame, inside_heading

    if inside_link and (name == 'text:a' or name == 'form:button' or name == 'draw:a'):
        inside_link = False
    elif inside_frame and (name == 'draw:frame'):
        inside_frame = False
    elif inside_heading and (name == 'text:h'):
        inside_heading = False
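# Together, the three handlers above form a small state machine: the
# inside_* flags record where the parser currently is, so that
# xml_char_data() knows whether incoming text belongs to a link, a
# heading, or neither.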


##########################################################
# Main program
##########################################################

# Define and check command line arguments

usage = 'usage: %prog [options] [OpenOffice.org document files]'
description = 'Checks OpenOffice.org documents for broken links'

optparser = optparse.OptionParser(usage=usage, version='coool 1.0beta1', description=description)

optparser.add_option('-v', '--verbose',
                     action='store_true', dest='verbose', default=False,
                     help='display progress information')

(options, args) = optparser.parse_args()
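# Typical invocations, assuming the script is installed as 'coool'
# (the document names here are only illustrative):
#   coool report.odt slides.odp
#   coool --verbose *.odt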

if len(args) == 0:
    print 'No files to check. Exiting.'
    sys.exit()

# Iterate on all given documents

for input_file in args:

    print 'Checking links in file', input_file, '...'
    link_text = dict()
    link_type = dict()
    internal_targets = set()
    thread_count = 0

    # Unzip the OpenOffice.org (Open Document Format) document

    try:
        zip = zipfile.ZipFile(input_file, 'r')
    except IOError, (errno, why):
        print ' ERROR: Cannot open file', input_file, ':', why
        sys.exit()
    except zipfile.BadZipfile:
        print ' ERROR: not an OpenOffice.org Open Document Format document:', input_file
        sys.exit()

    # Parse xml files and record urls

    for xml_file in ['styles.xml', 'content.xml']:

        try:
            xml_contents = zip.read(xml_file)
        except KeyError:
            print ' ERROR: not an OpenOffice.org Open Document Format document:', input_file
            sys.exit()

        inside_link, inside_frame, inside_heading = False, False, False

        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = xml_start_element
        parser.CharacterDataHandler = xml_char_data
        parser.EndElementHandler = xml_end_element
        parser.Parse(xml_contents)

    zip.close()

    # Run URL checks

    for url in link_type.keys():

        # Keep track of the number of pending URL checks

        lock.acquire()
        thread_count += 1
        lock.release()

        thread.start_new_thread(check_url_threaded, (url,))

    # Wait for all threads to complete

    while thread_count > 0:
        if options.verbose:
            print '[INFO] Waiting for URL checks to complete:', thread_count, 'threads left'
        time.sleep(1)

print 'Hyperlink checking complete.'
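# For reference, the output for a document containing one broken http
# link looks like this (the document name and URL below are illustrative):
#
#   Checking links in file slides.odp ...
#    URL failure: http://example.com/missing-page
#    Reason: HTTP Error 404: Not Found
#   Hyperlink checking complete.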
 