libvirt/scripts/check-html-references.py

#!/usr/bin/env python3
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library.  If not, see
# <http://www.gnu.org/licenses/>.
#
# Check that external references between documentation HTML files are not broken.

import sys
import os
import argparse
import re
import xml.etree.ElementTree as ET

ns = {'html': 'http://www.w3.org/1999/xhtml'}
externallinks = []


def get_file_list(prefix):
    filelist = []

    for root, dir, files in os.walk(prefix):
        prefixbase = os.path.dirname(prefix)

        if root.startswith(prefixbase):
            relroot = root[len(prefixbase):]
        else:
            relroot = root

        for file in files:
            if not re.search('\\.html$', file):
                continue

            # the 404 page doesn't play well
            if '404.html' in file:
                continue

            fullfilename = os.path.join(root, file)
            relfilename = os.path.join(relroot, file)
            filelist.append((fullfilename, relfilename))

    return filelist


# loads an XHTML and extracts all anchors, local and remote links for the one file
def process_file(filetuple):
    filename, relfilename = filetuple
    tree = ET.parse(filename)
    root = tree.getroot()

    anchors = [relfilename]
    targets = []

    for elem in root.findall('.//html:a', ns):
        target = elem.get('href')
        an = elem.get('id')

        if an:
            anchors.append(relfilename + '#' + an)

        if target:
            if re.search('://', target):
                externallinks.append(target)
            elif target[0] != '#' and 'mailto:' not in target:
                dirname = os.path.dirname(relfilename)
                targetname = os.path.normpath(os.path.join(dirname, target))

                targets.append((targetname, filename, target))

    # older docutils generate "<div class='section'"
    for elem in root.findall('.//html:div/[@class=\'section\']', ns):
        an = elem.get('id')

        if an:
            anchors.append(relfilename + '#' + an)

    # modern docutils generate a <section element
    for elem in root.findall('.//html:section', ns):
        an = elem.get('id')

        if an:
            anchors.append(relfilename + '#' + an)

    return (anchors, targets)


def process_all(filelist):
    anchors = []
    targets = []

    for filetuple in filelist:
        anchor, target = process_file(filetuple)

        targets = targets + target
        anchors = anchors + anchor

    return (targets, anchors)


def check_targets(targets, anchors):
    errors = []
    for target, targetfrom, targetorig in targets:
        if target not in anchors:
            errors.append((targetfrom, targetorig))

    if errors:
        errors.sort()

        print('broken link targets:')

        for file, target in errors:
            print(file + " broken link: " + target)

        return True

    return False


parser = argparse.ArgumentParser(description='HTML reference checker')
parser.add_argument('--prefix', default='.',
                    help='build tree prefix')
parser.add_argument('--external', action="store_true",
                    help='print external references instead')

args = parser.parse_args()

files = get_file_list(args.prefix)

targets, anchors = process_all(files)

if args.external:
    prev = None
    externallinks.sort()
    for ext in externallinks:
        if ext != prev:
            print(ext)

        prev = ext
else:
    if check_targets(targets, anchors):
        sys.exit(1)

    sys.exit(0)
Import Upstream version 8.5.0 2022-11-08 17:23:04 +08:00			`#!/usr/bin/env python3`
			`#`
			`# This library is free software; you can redistribute it and/or`
			`# modify it under the terms of the GNU Lesser General Public`
			`# License as published by the Free Software Foundation; either`
			`# version 2.1 of the License, or (at your option) any later version.`
			`#`
			`# This library is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`# Lesser General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU Lesser General Public`
			`# License along with this library. If not, see`
			`# <http://www.gnu.org/licenses/>.`
			`#`
			`# Check that external references between documentation HTML files are not broken.`

			`import sys`
			`import os`
			`import argparse`
			`import re`
			`import xml.etree.ElementTree as ET`

			`ns = {'html': 'http://www.w3.org/1999/xhtml'}`
			`externallinks = []`


			`def get_file_list(prefix):`
			`filelist = []`

			`for root, dir, files in os.walk(prefix):`
			`prefixbase = os.path.dirname(prefix)`

			`if root.startswith(prefixbase):`
			`relroot = root[len(prefixbase):]`
			`else:`
			`relroot = root`

			`for file in files:`
			`if not re.search('\\.html$', file):`
			`continue`

			`# the 404 page doesn't play well`
			`if '404.html' in file:`
			`continue`

			`fullfilename = os.path.join(root, file)`
			`relfilename = os.path.join(relroot, file)`
			`filelist.append((fullfilename, relfilename))`

			`return filelist`


			`# loads an XHTML and extracts all anchors, local and remote links for the one file`
			`def process_file(filetuple):`
			`filename, relfilename = filetuple`
			`tree = ET.parse(filename)`
			`root = tree.getroot()`

			`anchors = [relfilename]`
			`targets = []`

			`for elem in root.findall('.//html:a', ns):`
			`target = elem.get('href')`
			`an = elem.get('id')`

			`if an:`
			`anchors.append(relfilename + '#' + an)`

			`if target:`
			`if re.search('://', target):`
			`externallinks.append(target)`
			`elif target[0] != '#' and 'mailto:' not in target:`
			`dirname = os.path.dirname(relfilename)`
			`targetname = os.path.normpath(os.path.join(dirname, target))`

			`targets.append((targetname, filename, target))`

			`# older docutils generate "<div class='section'"`
			`for elem in root.findall('.//html:div/[@class=\'section\']', ns):`
			`an = elem.get('id')`

			`if an:`
			`anchors.append(relfilename + '#' + an)`

			`# modern docutils generate a <section element`
			`for elem in root.findall('.//html:section', ns):`
			`an = elem.get('id')`

			`if an:`
			`anchors.append(relfilename + '#' + an)`

			`return (anchors, targets)`


			`def process_all(filelist):`
			`anchors = []`
			`targets = []`

			`for filetuple in filelist:`
			`anchor, target = process_file(filetuple)`

			`targets = targets + target`
			`anchors = anchors + anchor`

			`return (targets, anchors)`


			`def check_targets(targets, anchors):`
			`errors = []`
			`for target, targetfrom, targetorig in targets:`
			`if target not in anchors:`
			`errors.append((targetfrom, targetorig))`

			`if errors:`
			`errors.sort()`

			`print('broken link targets:')`

			`for file, target in errors:`
			`print(file + " broken link: " + target)`

			`return True`

			`return False`


			`parser = argparse.ArgumentParser(description='HTML reference checker')`
			`parser.add_argument('--prefix', default='.',`
			`help='build tree prefix')`
			`parser.add_argument('--external', action="store_true",`
			`help='print external references instead')`

			`args = parser.parse_args()`

			`files = get_file_list(args.prefix)`

			`targets, anchors = process_all(files)`

			`if args.external:`
			`prev = None`
			`externallinks.sort()`
			`for ext in externallinks:`
			`if ext != prev:`
			`print(ext)`

			`prev = ext`
			`else:`
			`if check_targets(targets, anchors):`
			`sys.exit(1)`

			`sys.exit(0)`