from bs4 import BeautifulSoup, NavigableString
import collections
import multiprocessing
import os
import re
import signal
import sys
class Symbol:
    """Plain record holding a symbol's name, its namespace and its headers."""

    def __init__(self, name, namespace, headers):
        # All three values are stored exactly as given; no copy is made, so
        # `headers` stays the very object the caller passed in.
        self.name, self.namespace, self.headers = name, namespace, headers
def _HasClass(tag, *classes):
for c in tag.get('class', []):
if c in classes:
return True
return False
def _ParseSymbolPage(symbol_page_html, symbol_name):
    """Extract the headers a symbol page names for `symbol_name`.

    Every ``table.t-dcl-begin`` / ``table.t-dsc-begin`` table is walked row by
    row:

    * a ``t-dsc-header`` row whose text contains "Defined in header " adds the
      text of each of its ``<code>`` elements to the current header group;
    * a ``t-dcl`` / ``t-dsc`` row is a declaration row; if one of the stripped
      strings of its first ``<td>`` equals `symbol_name`, the current header
      group is attributed to the symbol.

    Args:
        symbol_page_html: HTML text of the symbol page.
        symbol_name: symbol name to look for in the declaration rows.

    Returns:
        The set of header strings attributed to `symbol_name`. If no
        declaration row named the symbol, the set of all headers seen on the
        page is returned instead.
    """
    headers = set()
    all_headers = set()
    soup = BeautifulSoup(symbol_page_html, "html.parser")
    for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
        current_headers = []
        was_decl = False
        for row in table.select('tr'):
            if _HasClass(row, 't-dcl', 't-dsc'):
                was_decl = True
                # Symbols are looked up in the first cell only. find() returns
                # None when the row has no <td>; such a row cannot name the
                # symbol, so skip it instead of raising AttributeError.
                first_cell = row.find('td')
                if first_cell is None:
                    continue
                if symbol_name not in first_cell.stripped_strings:
                    continue
                headers.update(current_headers)
            elif _HasClass(row, 't-dsc-header'):
                # A header row that follows a declaration row starts a new
                # header group for the declarations that come after it.
                if was_decl:
                    current_headers = []
                was_decl = False
                # Only "Defined in header" rows contribute headers; other
                # t-dsc-header rows are ignored.
                if "Defined in header " not in row.text:
                    continue
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # Fall back to every header on the page when the symbol was never named.
    return headers or all_headers
def _ParseIndexPage(index_page_html):
    """List the symbols linked from an index page.

    Every ``<a title=...>`` element that contains a ``<tt>`` child yields one
    entry; links without a ``<tt>`` are skipped.

    Args:
        index_page_html: HTML text of the index page.

    Returns:
        A list of ``(symbol_name, href, variant)`` tuples in document order.
        ``symbol_name`` is the ``<tt>`` text with trailing ``<>()`` characters
        removed, ``href`` is the link target, and ``variant`` is the text node
        directly after the link with surrounding spaces and parentheses
        stripped, or None when that node is absent or contains no "(".
    """
    soup = BeautifulSoup(index_page_html, "html.parser")
    entries = []
    for link in soup.select("a[title]"):
        # The annotation, if any, is the text node right after the link.
        trailing = link.next_sibling
        annotated = isinstance(trailing, NavigableString) and "(" in trailing
        variant = trailing.text.strip(" ()") if annotated else None
        name_tag = link.find("tt")
        if not name_tag:
            continue
        entries.append((name_tag.text.rstrip("<>()"), link["href"], variant))
    return entries
def _ReadSymbolPage(path, name):
    """Read the symbol page stored at `path` and parse it for `name`.

    Args:
        path: filesystem path of the symbol page.
        name: symbol name forwarded to _ParseSymbolPage.

    Returns:
        Whatever _ParseSymbolPage returns for the file's contents.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    # NOTE(review): assumes the pages are UTF-8 encoded - confirm for the
    # archive in use.
    with open(path, encoding="utf-8") as f:
        return _ParseSymbolPage(f.read(), name)
def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
    """Collect the symbols of one index page together with their headers.

    The index page is parsed for (name, page path, variant) entries, each
    accepted entry's symbol page is parsed in a pool worker, and the headers
    found for equal symbol names are merged.

    Args:
        pool: worker pool; only ``apply_async`` is used.
        root_dir: directory that the index page and the symbol page paths are
            joined onto.
        index_page_name: path of the index page relative to `root_dir`.
        namespace: stored on every returned Symbol and prefixed to the symbol
            name when looking up `variants_to_accept`; a falsy value is
            treated as an empty prefix.
        variants_to_accept: mapping from qualified symbol name to the
            collection of variants that are accepted for that symbol.

    Returns:
        A list of Symbol objects sorted by name, each carrying its headers as
        a sorted list.
    """
    index_page_path = os.path.join(root_dir, index_page_name)
    # Decode explicitly as UTF-8 rather than with the locale default.
    # NOTE(review): assumes the pages are UTF-8 encoded - confirm.
    with open(index_page_path, "r", encoding="utf-8") as f:
        index_entries = _ParseIndexPage(f.read())

    # Schedule every symbol page first so the workers run in parallel.
    results = []  # (symbol_name, async result yielding the page's headers)
    for symbol_name, symbol_page_path, variant in index_entries:
        # Entries with a variant annotation are skipped unless that variant
        # is explicitly accepted for the qualified symbol name.
        variants_for_symbol = variants_to_accept.get(
            (namespace or "") + symbol_name, ())
        if variant and variant not in variants_for_symbol:
            continue
        path = os.path.join(root_dir, symbol_page_path)
        results.append((symbol_name,
                        pool.apply_async(_ReadSymbolPage, (path, symbol_name))))

    # Merge the headers of entries that share a symbol name.
    symbol_headers = collections.defaultdict(set)
    for symbol_name, lazy_headers in results:
        symbol_headers[symbol_name].update(lazy_headers.get())

    # Sort the headers too: set iteration order is not stable across runs,
    # and the output should be deterministic.
    return [Symbol(name, namespace, sorted(symbol_headers[name]))
            for name in sorted(symbol_headers)]
def _IgnoreSigint():
    """Pool worker initializer: make the worker process ignore SIGINT."""
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def GetSymbols(parse_pages):
    """Collect the symbols of several index pages using a worker pool.

    Args:
        parse_pages: iterable of ``(root_dir, page_name, namespace)`` tuples,
            each forwarded to _GetSymbols.

    Returns:
        A list with the Symbol objects of all pages, in the order the pages
        were given.
    """
    # Variants that are accepted even though annotated index entries are
    # otherwise skipped. The value must be a real tuple: a bare string would
    # turn the membership test in _GetSymbols into a substring test.
    variants_to_accept = {
        "std::remove": ("algorithm",),
    }
    symbols = []
    # Workers ignore SIGINT. The initializer is a module-level function rather
    # than a lambda because it has to be picklable under the "spawn" and
    # "forkserver" start methods.
    pool = multiprocessing.Pool(initializer=_IgnoreSigint)
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace,
                                       variants_to_accept))
    finally:
        # Always tear the workers down, even when parsing a page failed.
        pool.terminate()
        pool.join()
    return symbols