#!/usr/bin/python3

"""
Inspect a set of markdown files, and warn if there are:
- duplicate links
- malformed links
"""

import argparse
import logging
import os
import re
import sys
import urllib.parse

import bs4
import markdown

LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)


class Warnings:
    """ A singleton object for gathering warnings to be printed later. """

    def __init__(self):
        self.warnings = []
        self.silent = False

    def silence(self, val):
        self.silent = val

    def warn(self, msg):
        if not self.silent:
            self.warnings.append(msg)

    def get(self):
        return self.warnings


# The singleton object that gathers warnings, for later reporting.
warnings = Warnings()

# A regex that matches filenames to inspect.
RE_FILENAME = re.compile(r'\d\d\d\d-\d\d-\d\d-this-week-in-rust\.md$')
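# (Illustrative match, with a made-up date: "2023-01-04-this-week-in-rust.md".)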

# A block-list of tracking parameters.
TRACKING_PARAMETERS = {
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_content',
}

# A list of section titles that will trigger duplicate-link detection.
STRICT_TITLES = [
    'updates from rust community',
]


def is_strict_title(title):
    """ Return True if this title is one that needs strict checks. """
    title = str(title)
    # .lower() doesn't necessarily handle Unicode in a robust way,
    # but the set of strings we care about is tiny and uses only ASCII.
    return title.lower() in STRICT_TITLES
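
# For example, is_strict_title('Updates from Rust Community') returns True,
# while a title like 'Crate of the Week' does not trigger strict checks.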


def check_truncated_title(tag):
    """ Flag any links that have a probably-truncated title.

    Links collected from Discord may be truncated to a length of exactly
    70 characters, including a "..." suffix.

    If we're unlucky enough to trigger this warning by mistake, here are
    some workarounds:
    - Make any change to the title so that it's not exactly 70 characters
      (e.g. add an extra space between words)
    - Replace the "..." with unicode "…"
    """
    title = tag.string
    LOG.debug(f'link title: {repr(title)}')
    if title and title.endswith('...') and len(title) == 70:
        warnings.warn(f'truncated link title: {repr(title)}')
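
# For example, a link title that is exactly 70 characters long and ends in
# '...' gets flagged; the same title at 69 or 71 characters, or ending in
# the unicode '…', does not.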


def extract_links(html):
    """ Return a list of links from this file.

    Links will only be returned if they are within a section deemed "strict".
    This allows us to ignore links that are deliberately repeated (to this
    GitHub repo and Twitter account, for example).

    Side-effects:
    - If links are malformed, warnings may be recorded. See `parse_url`
      for details.

    """
    strict_mode = False
    tags = ['a', 'h1', 'h2', 'h3', 'h4']
    urls = []

    # Remember the header level (h2, h3, etc) when we turned on
    # strict_mode.
    header_level = None

    for tag in bs4.BeautifulSoup(html, 'html.parser').find_all(tags):
        if tag.name == 'a':
            link = tag.get('href')
            LOG.debug(f'found link tag: {link}')
            if strict_mode:
                check_truncated_title(tag)
                trimmed_url = parse_url(link)
                urls.append(trimmed_url)
        else:
            # Tag names compare lexicographically here ('h3' > 'h2'),
            # which is safe for single-digit heading levels.
            level = tag.name
            if header_level and level > header_level:
                LOG.debug(f'skipping {tag}, overridden at {header_level}')
                continue

            # This is the title of a section. If this title is "strict",
            # we will check for any duplicate links inside it.

            strict_mode = is_strict_title(tag.string)
            if strict_mode:
                header_level = level
            else:
                header_level = None
            LOG.debug(f'found heading tag: {tag} (strict={strict_mode})')

    return urls
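
# A sketch of the behavior, using made-up HTML: given
#   <h2>Updates from Rust Community</h2> ... <a href="X">link</a>
#   <h2>Crate of the Week</h2> ... <a href="Y">link</a>
# only X is collected, because only the first heading is a "strict" title.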


def scrub_parameters(url, query):
    """ Strip tracking parameters from the URL """
    query_dict = urllib.parse.parse_qs(query)

    filtered_dict = {}
    found_tracking = []
    for k, v in query_dict.items():
        if k in TRACKING_PARAMETERS:
            found_tracking.append(k)
        else:
            filtered_dict[k] = v

    # Store a warning if we found any tracking parameters.
    if found_tracking:
        warnings.warn(f'found tracking parameters on {url}: {found_tracking}')

    # If there are no query parameters left, return the empty string.
    if not filtered_dict:
        return ''

    # Re-encode remaining URL parameters
    return urllib.parse.urlencode(filtered_dict, doseq=True)
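
# For example (example.com is hypothetical):
#   scrub_parameters('https://example.com/?utm_source=a&id=1', 'utm_source=a&id=1')
# returns 'id=1' and records a warning about the 'utm_source' parameter.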


def parse_url(link):
    """ Parse a URL and return it in a stripped-down form.

    In an effort to detect duplicate URLs, some information is removed:
    - tracking parameters are removed (see `scrub_parameters`)
    - "http" and "https" URLs are considered the same.
    - consecutive slashes and trailing slashes are ignored.

    Warnings may be issued if unnecessary tracking parameters are found,
    or if the URL contains consecutive slashes.
    """
    parsed_url = urllib.parse.urlsplit(link)

    scheme = parsed_url.scheme
    if scheme not in ('mailto', 'http', 'https'):
        warnings.warn(f'possibly malformed link: {link}')
    if scheme == 'http':
        scheme = 'https'

    # If there are query parameters present, give them a cleanup pass to
    # remove irrelevant ones.
    query = parsed_url.query
    if query:
        LOG.debug(f'{parsed_url.geturl()} found query parameters: {query}')
        query = scrub_parameters(link, query)
        if query:
            LOG.debug(
                f'{parsed_url.geturl()} keeping query parameters: {query}')

    # Remove consecutive slashes, because https://path/to////file and
    # https://path/to/file are the same.
    path = parsed_url.path
    path_components = path.split('/')
    trailing_slash = path_components[-1] == ''
    path_components = [s for s in path_components if s]
    path = '/'.join(path_components)
    if trailing_slash:
        path += '/'

    # Re-constitute the URL with any simplifications that should trigger
    # a warning.
    (sch, loc, _, _, frag) = parsed_url
    reconstituted = urllib.parse.urlunsplit((sch, loc, path, query, frag))
    if reconstituted != link:
        LOG.debug(f'reconstituted: {reconstituted}')
        warnings.warn(f'link can be simplified: {link} -> {reconstituted}')

    # Strip any trailing slashes, again.
    path = path.rstrip('/')

    # Re-constitute a second time, including more simplifications that we
    # don't need to warn about.
    reconstituted = urllib.parse.urlunsplit((scheme, loc, path, query, frag))

    return reconstituted
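
# A worked example (example.com is hypothetical):
#   parse_url('http://example.com//a/b/?utm_source=a')
# returns 'https://example.com/a/b', recording warnings for the tracking
# parameter and for the simplifiable doubled/trailing slashes.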


def inspect_file(filename):
    """ Extract the links from one markdown file. """
    LOG.info(f'inspecting file {filename}')
    with open(filename) as f:
        md_text = f.read()
    html = markdown.markdown(md_text)
    links = extract_links(html)
    LOG.debug(f'examining {len(links)} links')
    return links


def get_recent_files(dirs, count):
    """ Return a list of the `count` most recent markdown files in `dirs`.

    `dirs` is a colon-separated list of directories. We assume the files
    are named "YYYY-MM-DD-this-week-in-rust.md", so sorting by filename
    also sorts by date.
    """
    LOG.debug(f'searching for {count} recent files in "{dirs}"')

    listing = []
    for directory in dirs.split(':'):
        files = os.listdir(path=directory)
        if not files:
            raise Exception(f'No files found in {directory}')
        files = list(filter(RE_FILENAME.match, files))
        if not files:
            raise Exception(f'No matching files found in {directory}')

        # create a tuple (file, file+path) so we can sort by filename
        file_tuples = [(f, os.path.join(directory, f)) for f in files]
        listing.extend(file_tuples)

    listing.sort()
    listing = listing[-count:]

    # return the file+path.
    listing = [tup[1] for tup in listing]

    LOG.info(f'recent files: {listing}')
    return listing
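
# For example, given a hypothetical layout with
# content/2023-01-04-this-week-in-rust.md and
# draft/2023-01-11-this-week-in-rust.md on disk,
#   get_recent_files('content:draft', 1)
# returns ['draft/2023-01-11-this-week-in-rust.md'].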


def inspect_files(file_list, num_warn):
    """ Inspect a set of files, storing warnings about duplicate links. """
    linkset = {}

    # If we inspect 5 files (enumerated 0-4), and want to warn on 2,
    # then the warnings start at N=3 (length - num_warn).
    warn_index = len(file_list) - num_warn

    for index, file in enumerate(file_list):
        warnings.silence(index < warn_index)
        links = inspect_file(file)
        LOG.debug(f'found links: {links}')
        for link in links:
            collision = linkset.get(link)
            if collision:
                warnings.warn(
                    f"possible duplicate link {link} in file {file} "
                    f"(also found in {collision})")
            else:
                linkset[link] = file
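
# For example, with file_list ['old.md', 'new.md'] (hypothetical names) and
# num_warn=1, links in old.md are recorded silently; a link in new.md that
# already appeared in old.md then triggers a duplicate-link warning.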


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--paths', default='content:draft',
                        help="Directory paths to inspect (colon separated)")
    parser.add_argument('--num-recent', default=25, type=int,
                        help="Number of recent files to inspect")
    parser.add_argument('--num-warn', default=1, type=int,
                        help="Number of recent files to warn about")
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()
    if args.debug:
        LOG.setLevel(logging.DEBUG)
    LOG.debug(f'command-line arguments: {args}')
    file_list = get_recent_files(args.paths, args.num_recent)
    inspect_files(file_list, args.num_warn)


def setup_logging():
    """ Send all log output to stdout via the root logger. """
    log_stdout = logging.StreamHandler(sys.stdout)
    logging.getLogger('').addHandler(log_stdout)


if __name__ == "__main__":
    setup_logging()
    main()

    warns = warnings.get()
    if warns:
        print("warnings exist:")
        for w in warns:
            print(w)
        sys.exit(1)
    else:
        print("everything is ok!")