173 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			173 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Python
		
	
	
	
#!/usr/bin/env python
 | 
						|
#===- cppreference_parser.py -  ------------------------------*- python -*--===#
 | 
						|
#
 | 
						|
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 | 
						|
# See https://llvm.org/LICENSE.txt for license information.
 | 
						|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 | 
						|
#
 | 
						|
#===------------------------------------------------------------------------===#
 | 
						|
 | 
						|
from bs4 import BeautifulSoup, NavigableString
 | 
						|
 | 
						|
import collections
 | 
						|
import multiprocessing
 | 
						|
import os
 | 
						|
import re
 | 
						|
import signal
 | 
						|
import sys
 | 
						|
 | 
						|
 | 
						|
class Symbol:
  """A named symbol together with the headers that provide it."""

  def __init__(self, name, namespace, headers):
    """Store the symbol's identity.

    Args:
      name: unqualified symbol name, e.g. "move".
      namespace: namespace of the symbol (with trailing "::"), e.g. "std::",
        or "" for the global scope. None for C symbols.
      headers: a list of corresponding headers.
    """
    self.name, self.namespace, self.headers = name, namespace, headers
 | 
						|
 | 
						|
 | 
						|
def _HasClass(tag, *classes):
 | 
						|
  for c in tag.get('class', []):
 | 
						|
    if c in classes:
 | 
						|
      return True
 | 
						|
  return False
 | 
						|
 | 
						|
 | 
						|
def _ParseSymbolPage(symbol_page_html, symbol_name):
  """Parse symbol page and retrieve the include header defined in this page.

  The symbol page provides header for the symbol, specifically in
  "Defined in header <header>" section. An example:

  <tr class="t-dsc-header">
    <td colspan="2"> <div>Defined in header <code><ratio></code> </div>
  </td></tr>

  Args:
    symbol_page_html: HTML text of the symbol page.
    symbol_name: name to look for in the first cell of declaration rows.

  Returns a set of headers: the headers preceding the rows that name the
  symbol, or, if no row names it, every header mentioned in the tables.
  """
  headers = set()
  all_headers = set()

  soup = BeautifulSoup(symbol_page_html, "html.parser")
  # Rows in table are like:
  #   Defined in header <foo>      .t-dsc-header
  #   Defined in header <bar>      .t-dsc-header
  #   decl1                        .t-dcl
  #   Defined in header <baz>      .t-dsc-header
  #   decl2                        .t-dcl
  for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
    current_headers = []
    was_decl = False
    for row in table.select('tr'):
      if _HasClass(row, 't-dcl', 't-dsc'):
        was_decl = True
        # Symbols are in the first cell. A row without any <td> cannot name
        # the symbol, so skip it instead of failing on None.
        first_cell = row.find('td')
        if first_cell is None:
          continue
        if symbol_name not in first_cell.stripped_strings:
          continue
        headers.update(current_headers)
      elif _HasClass(row, 't-dsc-header'):
        # If we saw a decl since the last header, this is a new block of headers
        # for a new block of decls.
        if was_decl:
          current_headers = []
        was_decl = False
        # There are also .t-dsc-header for "defined in namespace".
        if "Defined in header " not in row.text:
          continue
        # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
        for header_code in row.find_all("code"):
          current_headers.append(header_code.text)
          all_headers.add(header_code.text)
  # If the symbol was never named, consider all named headers.
  return headers or all_headers
 | 
						|
 | 
						|
 | 
						|
def _ParseIndexPage(index_page_html):
  """Extract the symbols listed on an index page.

  The index page lists all std symbols, each linking to its detailed page
  (which contains the defined header). An example:

  <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
  <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

  Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant),
  where variant is True when the link is immediately followed by text
  containing "(".
  """
  parsed = BeautifulSoup(index_page_html, "html.parser")
  entries = []
  for link in parsed.select("a[title]"):
    name_tag = link.find("tt")
    if not name_tag:
      continue
    # Annotated symbols like "acos<>() (std::complex)" are flagged as variants.
    # These tend to be overloads, and the primary is more useful.
    # This accidentally accepts begin/end despite the (iterator) caption: the
    # (since C++11) note is first. They are good symbols, so the bug is unfixed.
    trailing = link.next_sibling
    is_variant = isinstance(trailing, NavigableString) and "(" in trailing
    # Strip any trailing <>() from the displayed name.
    bare_name = name_tag.text.rstrip("<>()")
    entries.append((bare_name, link["href"], is_variant))
  return entries
 | 
						|
 | 
						|
 | 
						|
def _ReadSymbolPage(path, name):
  """Read the symbol page at `path` and return _ParseSymbolPage's result for
  the symbol `name`."""
  with open(path) as page:
    html = page.read()
  return _ParseSymbolPage(html, name)
 | 
						|
 | 
						|
 | 
						|
def _GetSymbols(pool, root_dir, index_page_name, namespace):
  """Get all symbols listed in the index page. All symbols should be in the
  given namespace.

  Args:
    pool: object providing apply_async(), used to read symbol pages in
      parallel.
    root_dir: directory containing the index page; symbol page paths from the
      index are joined onto it.
    index_page_name: file name of the index page inside root_dir.
    namespace: namespace stored on every returned Symbol.

  Returns a list of Symbols, sorted by name. The headers of each Symbol are
  sorted as well, so the result does not depend on set iteration order.
  """

  # Workflow steps:
  #   1. Parse index page which lists all symbols to get symbol
  #      name (unqualified name) and its href link to the symbol page which
  #      contains the defined header.
  #   2. Parse the symbol page to get the defined header.
  index_page_path = os.path.join(root_dir, index_page_name)
  # Only hold the index file open while reading it, not while waiting for the
  # workers below.
  with open(index_page_path, "r") as f:
    index_entries = _ParseIndexPage(f.read())

  # Read each symbol page in parallel.
  results = [] # (symbol_name, promise of [header...])
  for symbol_name, symbol_page_path, variant in index_entries:
    # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
    # FIXME: use these as a fallback rather than ignoring entirely.
    if variant:
      continue
    path = os.path.join(root_dir, symbol_page_path)
    results.append((symbol_name,
                    pool.apply_async(_ReadSymbolPage, (path, symbol_name))))

  # Build map from symbol name to a set of headers.
  symbol_headers = collections.defaultdict(set)
  for symbol_name, lazy_headers in results:
    symbol_headers[symbol_name].update(lazy_headers.get())

  # Sort both the symbols and each symbol's headers for a reproducible result.
  return [Symbol(name, namespace, sorted(symbol_headers[name]))
          for name in sorted(symbol_headers)]
 | 
						|
 | 
						|
 | 
						|
def GetSymbols(parse_pages):
  """Get all symbols by parsing the given pages.

  Args:
    parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

  Returns a list with the results of _GetSymbols for every page, in the order
  the pages were given.
  """
  symbols = []
  # Run many workers to process individual symbol pages under the symbol index.
  # Don't allow workers to capture Ctrl-C.
  # The initializer is passed as a module-level function plus arguments rather
  # than a lambda: a lambda cannot be pickled, which breaks pool creation when
  # worker processes are spawned instead of forked.
  pool = multiprocessing.Pool(
      initializer=signal.signal,
      initargs=(signal.SIGINT, signal.SIG_IGN))
  try:
    for root_dir, page_name, namespace in parse_pages:
      symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace))
  finally:
    # Always shut the workers down, even if parsing a page failed.
    pool.terminate()
    pool.join()
  return symbols
 |