# manithree/test-pijul1: src/utfastatutes

import httpclient, nre, options, json,
       strutils, strformat, strtabs,
       xmlparser, xmltree, times, os


const indentWidth = 3
  ## Left-margin step, in percent, added for each nesting level of the
  ## generated HTML (used as `(indent-1)*indentWidth` in `margin-left`).

proc generateHtmlHeading(element: XmlNode, indent: int, htmlToc: var string, htmlBody: var string, faLinks: var StringTableRef) =
  ## Emits the heading for one statute element.
  ##
  ## * Appends a table-of-contents link for the element to `htmlToc`.
  ## * Appends an `<h{indent}>` heading, anchored as `_<number>`, to
  ##   `htmlBody`. Both are labelled with the element's `number` attribute
  ##   followed by the text of its `catchline` child.
  ## * Records `number` in `faLinks`, mapped to itself, marking it as an
  ##   anchor that exists in the generated document.
  ## * When the element has an `effdate` child, appends an
  ##   "(Effective ...)" paragraph after the heading.
  ##
  ## `indent` serves both as the heading level and, multiplied by
  ## `indentWidth`, as the left margin percentage.
  let number = element.attr("number")

  # `child` returns nil when no such tag exists; fall back to an empty title
  # rather than handing nil to `innerText`.
  let catchline = child(element, "catchline")
  let title = if catchline.isNil(): "" else: innerText(catchline)

  let level = intToStr(indent)
  let margin = intToStr((indent-1)*indentWidth)

  htmlToc &= "<a style=\"margin-left: " & margin & "%;\" href=\"#_" & number &
             "\">" & number & " " & title & "</a><br/>\n"
  faLinks[number] = number
  htmlBody &= "<h" & level & " id=\"_" & number & "\">" & number & " " &
              title & "</h" & level & ">\n"

  let effdate = child(element, "effdate")
  if not effdate.isNil():
    htmlBody &= "<p style=\"display: block; margin-left: " & margin &
                "%;\">(Effective " & innerText(effdate) & ")</p>\n"

proc convertSubsectionToHtml(element: XmlNode, parentNumber: string, htmlToc: var string, htmlBody: var string, indent: int, faLinks: var StringTableRef) =
  ## Renders one subsection (and, recursively, its nested subsections) as an
  ## indented `<p>` appended to `htmlBody`.
  ##
  ## The paragraph is anchored as `_<number>` and starts with the part of the
  ## element's `number` attribute that follows `parentNumber`. Text children
  ## are copied through, `xref` children become in-page links, and
  ## `histories`, `catchline` and `tab` children are skipped with a console
  ## note. `number` is recorded in `faLinks` as an existing anchor; every
  ## `xref` target not yet known is recorded with an empty value so it can be
  ## recognised later as pointing outside the document.
  ##
  ## `htmlToc` is only passed through to the recursive calls.
  let number = element.attr("number")
  # Strip the parent's prefix for display. If the number does not actually
  # extend the parent's (e.g. the attribute is missing), show it unchanged
  # instead of slicing past its end.
  let dispNum =
    if number.startsWith(parentNumber): number[parentNumber.len..^1]
    else: number
  let margin = intToStr((indent-1)*indentWidth)

  htmlBody &= "<p id=\"_" & number & "\" style=\"display: block; margin-left: " &
              margin & "%;\">" & dispNum & " "
  faLinks[number] = number

  for node in element:
    case node.kind()
    of xnElement:
      case node.tag()
      of "xref":
        let target = node.attr("refnumber")
        htmlBody &= "<a href=\"#_" & target & "\">" & innerText(node) & "</a>"
        # An empty value marks a reference whose target has not (yet) been
        # emitted as an anchor in this document.
        if not faLinks.hasKey(target):
          faLinks[target] = ""
      of "subsection":
        convertSubsectionToHtml(node, number, htmlToc, htmlBody, indent+1, faLinks)
      of "histories":
        echo "skipping histories"
      of "catchline":
        echo "skipping catchline"
      of "tab":
        echo "skipping tab"
      else:
        echo "Unrecognized node type: " & node.tag()
    of xnText:
      htmlBody &= innerText(node) & " "
    else:
      discard
  htmlBody &= "</p>\n"

proc convertSectionToHtml(element: XmlNode, htmlToc: var string, htmlBody: var string, faLinks: var StringTableRef) =
  ## Renders a `section` element: a level-3 heading followed by each of its
  ## direct `subsection` children. A section without any subsection children
  ## is itself rendered as if it were a single subsection.
  generateHtmlHeading(element, 3, htmlToc, htmlBody, faLinks)

  let sectionNumber = element.attr("number")
  var sawSubsection = false
  for node in element:
    if node.kind != xnElement or node.tag != "subsection":
      continue
    convertSubsectionToHtml(node, sectionNumber, htmlToc, htmlBody, 3, faLinks)
    sawSubsection = true

  if not sawSubsection:
    # No subsections found: the section body holds the text directly.
    convertSubsectionToHtml(element, "", htmlToc, htmlBody, 3, faLinks)

proc convertChapterToHtml(element: XmlNode, htmlToc: var string, htmlBody: var string, faLinks: var StringTableRef) =
  ## Renders a `chapter` element: a level-1 heading, then every `section`
  ## found beneath it, closed off with a horizontal rule.
  element.generateHtmlHeading(1, htmlToc, htmlBody, faLinks)
  let sections = element.findAll("section")
  for node in sections:
    node.convertSectionToHtml(htmlToc, htmlBody, faLinks)
  htmlBody.add("<hr/>\n")

proc convertPartToHtml(element: XmlNode, htmlToc: var string, htmlBody: var string, faLinks: var StringTableRef) =
  ## Renders a `part` element: a level-2 heading, then every `section` found
  ## beneath it, closed off with a line break.
  element.generateHtmlHeading(2, htmlToc, htmlBody, faLinks)
  let sections = element.findAll("section")
  for node in sections:
    node.convertSectionToHtml(htmlToc, htmlBody, faLinks)
  htmlBody.add("<br/>\n")



proc convertXmlToHtml(statute: string, htmlToc: var string, htmlBody: var string, faLinks: var StringTableRef) =
  ## Parses `statute` as XML and dispatches on the root element's tag
  ## (`chapter`, `part` or `section`) to the matching converter, which appends
  ## to `htmlToc` / `htmlBody` and updates `faLinks`. Any other root is
  ## reported on the console and otherwise ignored.
  let root = parseXml(statute)
  if root.kind() != xnElement:
    echo "Unknown root element kind: "
    echo root.kind()
    return

  let rootTag = root.tag()
  if rootTag == "chapter":
    convertChapterToHtml(root, htmlToc, htmlBody, faLinks)
  elif rootTag == "part":
    convertPartToHtml(root, htmlToc, htmlBody, faLinks)
  elif rootTag == "section":
    convertSectionToHtml(root, htmlToc, htmlBody, faLinks)
  else:
    echo "Unknown root element tag: " & rootTag

proc writeHtml(fileName: string, htmlToc: string, htmlBody: string, page_title: string) =
  ## Assembles the complete HTML page and writes it to `fileName`.
  ##
  ## The page consists of a fixed head (title = `page_title` plus today's
  ## date, and an inline stylesheet), a banner with the title, the generation
  ## date and project links, then `htmlToc`, `htmlBody`, and the closing tags.
  const headStart = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
"""
  const styleAndBodyStart = """
<style>
html,body{font-size:100%}
h1{font-size:2.125em; margin-left: 0%}
h2{font-size:1.6875em; margin-left: 3%}
h3{font-size:1.375em; margin-left: 6%}
h4{font-size:1.125em; margin-left: 9%}
h5{font-size:1.125em; margin-left: 12%}
h6{font-size:1em; margin-left: 15%}
hr{border:solid #ddddd8;border-width:1px 0 0;clear:both;margin:1.25em 0 1.1875em;height:0}
</style>
</head>
<body>
"""
  const projectNotice = """
<p> See <a href="https://manithree.gitlab.io/utfastatutes/">https://manithree.gitlab.io/utfastatutes/</a> for  the latest version, or
<a href="https://gitlab.com/manithree/utfastatutes">https://gitlab.com/manithree/utfastatutes</a> to report defects or make suggestions.</p>
<hr/>
"""
  const documentEnd = """
</body>
</html>
"""

  var document = headStart
  document.add("<title>" & page_title & " " & getDateStr() & "</title>")
  document.add(styleAndBodyStart)
  document.add("<h1>" & page_title & "</h1>")
  document.add("<p>Generated " & getDateStr() & "</p>")
  document.add(projectNotice)
  document.add(htmlToc)
  document.add(htmlBody)
  document.add(documentEnd)
  writeFile(fileName, document)

proc fixLinks(htmlToc: var string, htmlBody: var string, faLinks: var StringTableRef) =
  ## Reports every in-page link in `htmlBody` whose target anchor was never
  ## emitted into the document.
  ##
  ## Each `href="#_<number>"` found in `htmlBody` is looked up in `faLinks`;
  ## a target is considered present only when `faLinks[number] == number`.
  ## Anything else (unknown key, or the empty placeholder value) is printed as
  ## "Needs fixing". Nothing is rewritten yet: `htmlToc`, `htmlBody` and
  ## `faLinks` are left unchanged.
  # this is where the external links are fixed up if I can figure out how
  # to reliably link to le.utah.gov
  echo "Fixing links"
  # Compile the pattern once instead of once per line.
  let hrefPattern = re("href=\"([^\"]+)\"")
  for line in splitLines(htmlBody):
    # A paragraph line can hold several cross references; visit all of them.
    for m in line.findIter(hrefPattern):
      let target = m.captures[0]
      if not target.startsWith("#_"):
        continue  # not one of the in-page anchors generated by this tool
      let statute = target[2..^1]
      # `[]` on a string table raises KeyError for unknown keys, so test for
      # presence first and treat a missing entry as needing a fix.
      if not faLinks.hasKey(statute) or faLinks[statute] != statute:
        echo "Needs fixing: " & statute


when isMainModule:
  # Entry point. Reads the JSON settings file named by the first command-line
  # argument, downloads the current version of every statute page listed under
  # "code", saves each download as-is in the working directory and, for the
  # xml -> html combination, merges them all into one HTML document.
  if paramCount() < 1:
    quit("usage: " & extractFilename(getAppFilename()) & " <settings.json>",
         QuitFailure)

  var htmlToc = ""
  var htmlBody = ""
  var faLinks = newStringTable()

  # Read the json config file
  let settings = parseJson(readFile(paramStr(1)))
  let codes = settings["code"]
  let title = settings["title"].getStr()
  let fileName = settings["filename"].getStr()
  let downloadFormat = settings["download_format"].getStr()
  let xmlToHtml = downloadFormat == "xml" and
                  settings["output_format"].getStr() == "html"

  # The landing page names the current version in a script variable.
  let versionPattern = re"var versionDefault=""(.*)"";"

  let client = newHttpClient()
  try:
    for code in codes:
      let pageUrl = code.getStr()
      let page = client.getContent(pageUrl)

      # couldn't get multi-line regex to work, but this is probably more
      # efficient, anyway:
      var version = ""
      for line in splitLines(page):
        let m = line.match(versionPattern)
        if isSome(m):
          version = m.get.captures[0]
          break
      if version.len == 0:
        # Without a version there is no file to download; fail with a clear
        # message instead of requesting "<base>/.<format>".
        raise newException(ValueError,
                           "no versionDefault found in " & pageUrl)

      # download the base versioned file; the base is everything before the
      # last '/', so joining with "/" yields exactly one separator.
      let urlBase = pageUrl[0 ..< max(pageUrl.rfind('/'), 0)]
      let versionedName = version & "." & downloadFormat
      let statute = client.getContent(urlBase & "/" & versionedName)
      # save the file (as downloaded)
      writeFile(versionedName, statute)

      if xmlToHtml:
        convertXmlToHtml(statute, htmlToc, htmlBody, faLinks)
  finally:
    # Release the connection even when a download or conversion fails.
    client.close()

  if xmlToHtml:
    fixLinks(htmlToc, htmlBody, faLinks)
    writeHtml(fileName, htmlToc, htmlBody, title)