Sisyphus repository
Last update: 1 October 2023 | SRPMs: 18631
ALT Linux repos
5.0: 1.1-alt1.1
4.1: 1.1-alt1.1
4.0: 1.0-alt0.1beta

Group :: Office
RPM: coool


#!/usr/bin/env python
#
# coool: Checks OpenOffice.Org Links
# Reports broken hyperlinks in documents
#
# Updates and more information available
# on http://free-electrons.com/community/tools/utils/coool
#
# Version 1.1
# Copyright (C) 2005-2007 Free Electrons
# Author: Michael Opdenacker <michael@free-electrons.com>
# Home page: http://free-electrons.com/community/tools/utils/coool
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
# NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 675 Mass Ave, Cambridge, MA 02139, USA.
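#
# Note: the code below is Python 2 code (it relies on urllib2, httplib,
# thread and on print statements) and it imports 'configparse', a
# third-party module providing the configuration-file-aware option
# parser used in the main program.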

import sys, os, zipfile, xml.parsers.expat, urllib2, urllib, urlparse, httplib, thread, time, configparse, string

lock = thread.allocate_lock()
host_lock = thread.allocate_lock()
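
# 'lock' serializes updates to the global pending-check counter
# (thread_count); 'host_lock' serializes access to the per-host
# token table (tokens_per_host)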

##########################################################
# Common routines
##########################################################

def debug(s):

    # Display debug information if debug mode is set

    global options

    if options.debug:
        print '[DEBUG] ' + s

def repeat_string(s, number):

    # Return a string made of 'number' copies of the given string

    result = ''

    for i in range(number):
        result += s

    return result

def url_failure(url, why):

    global link_text, link_type, link_xml_file, input_file, found_errors

    # Report the file name the first time
    # an error is found in the current file

    if not found_errors:
        found_errors = True
        print input_file
        print '-' * len(input_file)

    print '[ERROR] URL failure: ' + url
    print '        Reason:', why

    num_links = len(link_type[url])

    if num_links > 1:
        print '        ' + str(num_links) + ' links to this URL:'

    for i in range(num_links):

        print '        ' + string.capwords(link_type[url][i]) + ' link: "' + link_text[url][i] + '"'

        if link_xml_file[url][i] == 'styles.xml':
            print '        (found in style data: header, footer, master page...)'


def check_http_url(url):

    request = urllib2.Request(url)
    opener = urllib2.build_opener()

    # Add a browser-like User-Agent. Some sites (like Wikipedia) don't seem
    # to accept requests with the default urllib2 User-Agent
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.8) Gecko/20050511')

    try:
        opener.open(request).read()
    except urllib2.HTTPError, why:
        url_failure(url, why)
    except urllib2.URLError, why:
        # Doesn't work in all cases:
        # url_failure(url, why.__getitem__(0)[1])
        url_failure(url, why)
    except httplib.BadStatusLine, why:
        url_failure(url, why)

def check_ftp_url(url):

    # urllib2 doesn't raise any exception when an
    # ftp URL for an invalid file is given;
    # only invalid domains are reported.
    # That's why we use urllib.urlretrieve instead.

    try:
        tmpfile = urllib.urlretrieve(url)[0]
    except IOError, why:
        # Doesn't work in all cases:
        # url_failure(url, why.__getitem__(1)[1])
        url_failure(url, why)
    else:
        # With a non-existing file on an ftp URL,
        # we get a zero size output file.
        # Even if a real zero size file exists,
        # it's worth highlighting anyway
        # (no point in making a hyperlink to it)
        if os.path.getsize(tmpfile) == 0:
            url_failure(url, 'Non-existing or empty file')
        os.remove(tmpfile)

def check_internal_link(url):

    # Check whether the internal link corresponds
    # to a correct reference

    global internal_targets

    target = url[1:]

    # Remove a trailing ' (Notes)' substring if any.
    # This assumes that if the page exists, its notes page
    # exists too.

    if target.endswith(' (Notes)'):
        target = target[:-8]

    if target not in internal_targets:
        url_failure(url, 'No such target within document')


def check_url(url):

    debug('Checking link: ' + url)

    protocol = urlparse.urlparse(url)[0]

    if url[0] == '#':
        check_internal_link(url)
    elif protocol == 'http' or protocol == 'https':
        check_http_url(url)
    elif protocol == 'ftp':
        check_ftp_url(url)
    else:
        # Still try to handle other protocols
        try:
            urllib2.urlopen(url)
        except:
            url_failure(url, 'Unknown - Please report this to the developers of this script!')

def get_hostname(url):

    return urlparse.urlparse(url)[1]

def url_is_ignored(url):

    global options

    hostname = get_hostname(url)
    return options.exclude_hosts.count(hostname) > 0

def check_url_threaded(url):

    global lock, host_lock, thread_count, tokens_per_host, options

    # Make sure we don't run too many parallel checks per host:
    # that could bring the host down, or at least be seen as a
    # denial-of-service attack.

    hostname = get_hostname(url)

    if options.verbose:
        print '[INFO] Checking hyperlink:', url

    # Count parallel requests to the same host;
    # local links (empty hostname) are of course ignored

    if hostname != '':

        wait = True

        host_lock.acquire()
        if not tokens_per_host.has_key(hostname):
            tokens_per_host[hostname] = options.max_requests_per_host
        host_lock.release()

        # Wait until a request token for this host is available

        while wait:
            host_lock.acquire()
            if tokens_per_host[hostname] > 0:
                tokens_per_host[hostname] -= 1
                host_lock.release()
                wait = False
            else:
                host_lock.release()
                time.sleep(1)

    # Do the URL check!

    check_url(url)

    # The thread is about to complete: decrement the counters

    lock.acquire()
    thread_count -= 1
    lock.release()

    if hostname != '':
        host_lock.acquire()
        tokens_per_host[hostname] += 1
        host_lock.release()

def xml_start_element(name, attributes):

    # Called by the xml parser on each xml start tag

    global link_text, link_type, link_xml_file, xml_file, current_url, inside_link, internal_targets, inside_frame, current_frame
    global inside_heading, heading_level

    # Process hyperlinks

    if attributes.has_key('xlink:href') and (name == 'text:a' or name == 'form:button' or name == 'draw:a'):

        inside_link = True
        current_url = attributes['xlink:href']

        if not link_text.has_key(current_url):
            link_text[current_url], link_type[current_url], link_xml_file[current_url] = [], [], []

        link_xml_file[current_url].append(xml_file)

        if name == 'text:a':
            link_type[current_url].append('text')
        elif name == 'form:button':
            link_text[current_url].append(attributes['form:label'])
            link_type[current_url].append('button')
        elif name == 'draw:a':
            link_text[current_url].append(attributes['office:name'])
            link_type[current_url].append('image')

    # Presentations: record page names, to which internal links can be made

    elif name == 'draw:page':
        internal_targets.add(attributes['draw:name'])

    # Text documents: things to which internal links can be made

    elif name == 'text:bookmark':
        internal_targets.add(attributes['text:name'])
    elif name == 'text:section':
        internal_targets.add(attributes['text:name'] + '|region')
    elif name == 'draw:frame' and attributes.has_key('draw:name'):
        current_frame = attributes['draw:name']
        inside_frame = True
    elif name == 'table:table':
        internal_targets.add(attributes['table:name'] + '|table')
    elif name == 'text:h':
        inside_heading = True
        heading_level = int(attributes['text:outline-level'])
    elif inside_frame:
        if name == 'draw:image':
            internal_targets.add(current_frame + '|graphic')
        elif name == 'draw:text-box':
            internal_targets.add(current_frame + '|frame')
        elif name == 'draw:object':
            internal_targets.add(current_frame + '|ole')

def xml_char_data(data):

    global current_url, current_element, link_type, link_text
    global inside_heading, heading_level, internal_targets

    if inside_link:
        if link_type[current_url][-1] == 'text':

            # Truncate link text if too long

            if len(data) > 77:
                data = data[:77] + '...'

            link_text[current_url].append(data)

    if inside_heading:
        internal_targets.add(repeat_string('0.', heading_level) + data + '|outline')

def xml_end_element(name):

    global inside_link, inside_frame, inside_heading

    if inside_link and (name == 'text:a' or name == 'form:button' or name == 'draw:a'):
        inside_link = False
    elif inside_frame and (name == 'draw:frame'):
        inside_frame = False
    elif inside_heading and (name == 'text:h'):
        inside_heading = False


##########################################################
# Main program
##########################################################

# Command parameters
# Either default values, found in a configuration file
# or on the command line
#

usage = 'usage: %prog [options] [OpenOffice.org document files]'
description = 'Checks OpenOffice.org documents for broken links'

optparser = configparse.OptionParser(usage=usage, version='coool 1.1', description=description)

optparser.add_option('-v', '--verbose',
                     config='true',
                     action='store_true', dest='verbose', default=False,
                     help='display progress information')

optparser.add_option('-d', '--debug',
                     config='true',
                     action='store_true', dest='debug', default=False,
                     help='display debug information')

optparser.add_option('-t', '--max-threads',
                     config='true',
                     action='store', type='int', dest='max_threads', default=100,
                     help='set the maximum number of parallel threads to create')

optparser.add_option('-r', '--max-requests-per-host',
                     config='true',
                     action='store', type='int', dest='max_requests_per_host', default=5,
                     help='set the maximum number of parallel requests per host')

optparser.add_option('-x', '--exclude-hosts',
                     config='true',
                     action='store', type='string', dest='exclude_hosts', default='',
                     help='ignore URLs whose host name belongs to the given list')

(options, args) = optparser.parse_args(files=[os.path.expanduser("~/.cooolrc")])

if len(args) == 0:
    print 'No files to check. Exiting.'
    sys.exit()

# Turn options.exclude_hosts into a list, for exact matching

options.exclude_hosts = options.exclude_hosts.split()

# Iterate on all given documents

for input_file in args:

    if options.verbose:
        print '[INFO] Checking links in file', input_file, '...'

    link_text = dict()
    link_type = dict()
    link_xml_file = dict()
    internal_targets = set()
    thread_count = 0
    tokens_per_host = dict()

    # Unzip the OpenOffice.org (Open Document Format) document

    try:
        zip = zipfile.ZipFile(input_file, 'r')
    except IOError, (errno, why):
        print '   ERROR: Cannot open file', input_file, ':', why
        sys.exit()
    except zipfile.BadZipfile:
        print '   ERROR: not an OpenOffice.org Open Document Format document:', input_file
        sys.exit()

    # Parse the xml files and record urls

    for xml_file in ['styles.xml', 'content.xml']:

        try:
            xml_contents = zip.read(xml_file)
        except KeyError:
            print '   ERROR: not an OpenOffice.org Open Document Format document:', input_file
            sys.exit()

        inside_link, inside_frame, inside_heading = False, False, False

        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = xml_start_element
        parser.CharacterDataHandler = xml_char_data
        parser.EndElementHandler = xml_end_element
        parser.Parse(xml_contents)

    zip.close()

    # Run URL checks

    found_errors = False

    for url in link_type.keys():

        if not url_is_ignored(url):

            # Do not exceed the max number of threads

            while thread_count > options.max_threads:
                time.sleep(1)

            # Keep track of the number of pending URL checks;
            # it is up to the checking thread to decrement thread_count

            lock.acquire()
            thread_count += 1
            lock.release()

            thread.start_new_thread(check_url_threaded, (url,))

    # Wait for all threads to complete

    while thread_count > 0:
        if options.verbose:
            print '[INFO] Waiting for URL checks to complete:', thread_count, 'threads left'
        time.sleep(1)

if options.verbose:
    print '[INFO] Hyperlink checking complete.'
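
Typical usage, given the options defined above (the file names and excluded hosts below are only examples, not part of the package):

  coool --verbose --max-threads 50 --exclude-hosts 'localhost intranet.example.com' slides.odp report.odt

As the parse_args() call shows, option defaults may also be read from a ~/.cooolrc configuration file.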
 