// Compiler projects using llvm
//===- lib/DebugInfo/Symbolize/Markup.cpp ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file defines the log symbolizer markup data model and parser.
///
//===----------------------------------------------------------------------===//

#include "llvm/DebugInfo/Symbolize/Markup.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"

namespace llvm {
namespace symbolize {

// Regular expression source for the SGR control codes that are recognized in
// text outside of markup elements.
//
// Matches the following:
//   "\033[0m"
//   "\033[1m"
//   "\033[30m" -- "\033[37m"
//
// "\033" is the literal ESC character, "\\[" is a regex-escaped '[', and the
// parenthesized group holds the numeric parameter of the code.
static const char SGRSyntaxStr[] = "\033\\[([0-1]|3[0-7])m";

// Constructs a parser. MultilineTags is the set of element tags that are
// allowed to span several lines (consulted by parseMultiLineBegin); it is
// taken by value and moved into the parser. SGRSyntax is initialized from
// SGRSyntaxStr.
MarkupParser::MarkupParser(StringSet<> MultilineTags)
    : MultilineTags(std::move(MultilineTags)), SGRSyntax(SGRSyntaxStr) {}

// Returns the prefix of Str that stops just before Pos. Pos is expected to
// point into Str (or one past its end).
static StringRef takeTo(StringRef Str, StringRef::iterator Pos) {
  const size_t PrefixLen = Pos - Str.begin();
  return Str.take_front(PrefixLen);
}
// Shrinks Str in place so that it starts at Pos. Pos is expected to point
// into Str (or one past its end).
static void advanceTo(StringRef &Str, StringRef::iterator Pos) {
  const size_t Consumed = Pos - Str.begin();
  Str = Str.drop_front(Consumed);
}

// Starts parsing a new line. Nodes buffered for the previous line and the
// storage of any previously finished multi-line element are discarded;
// InProgressMultiline is left untouched so that an element opened on an
// earlier line can continue on this one.
void MarkupParser::parseLine(StringRef Line) {
  this->Line = Line;

  // Reset the per-line node buffer and its read cursor.
  NextIdx = 0;
  Buffer.clear();

  FinishedMultiline.clear();
}

// Returns the next node of the line most recently given to parseLine(), or
// None once nothing more can be produced from it.
//
// Nodes are produced in batches: each parsing step appends zero or more nodes
// to Buffer, and they are then handed out one at a time in buffer order. None
// is also returned when the rest of the line has been absorbed into a
// multi-line element that is still in progress.
Optional<MarkupNode> MarkupParser::nextNode() {
  // Pull something out of the buffer if possible.
  if (!Buffer.empty()) {
    if (NextIdx < Buffer.size())
      return std::move(Buffer[NextIdx++]);
    // Every buffered node has been handed out; reset for the next batch.
    NextIdx = 0;
    Buffer.clear();
  }

  // The buffer is empty, so parse the next bit of the line.

  if (Line.empty())
    return None;

  if (!InProgressMultiline.empty()) {
    if (Optional<StringRef> MultilineEnd = parseMultiLineEnd(Line)) {
      llvm::append_range(InProgressMultiline, *MultilineEnd);
      assert(FinishedMultiline.empty() &&
             "At most one multi-line element can be finished at a time.");
      FinishedMultiline.swap(InProgressMultiline);
      // Parse the multi-line element as if it were contiguous.
      advanceTo(Line, MultilineEnd->end());
      return *parseElement(FinishedMultiline);
    }

    // The whole line is part of the multi-line element.
    llvm::append_range(InProgressMultiline, Line);
    Line = Line.drop_front(Line.size());
    return None;
  }

  // Find the first valid markup element, if any.
  if (Optional<MarkupNode> Element = parseElement(Line)) {
    // Remember the element's extent before it is moved into the buffer so
    // that nothing is read back from the moved-from node.
    const StringRef ElementText = Element->Text;
    // Text preceding the element is emitted first, then the element itself.
    parseTextOutsideMarkup(takeTo(Line, ElementText.begin()));
    Buffer.push_back(std::move(*Element));
    advanceTo(Line, ElementText.end());
    return nextNode();
  }

  // Since there were no valid elements remaining, see if the line opens a
  // multi-line element.
  if (Optional<StringRef> MultilineBegin = parseMultiLineBegin(Line)) {
    // Emit any text before the element.
    parseTextOutsideMarkup(takeTo(Line, MultilineBegin->begin()));

    // Begin recording the multi-line element.
    llvm::append_range(InProgressMultiline, *MultilineBegin);
    Line = Line.drop_front(Line.size());
    return nextNode();
  }

  // The line doesn't contain any more markup elements, so emit it as text.
  parseTextOutsideMarkup(Line);
  Line = Line.drop_front(Line.size());
  return nextNode();
}

// Discards the buffered nodes and the remainder of the current line. If a
// multi-line element is still in progress, its accumulated text is re-parsed
// as text outside markup, so the resulting nodes become available through
// nextNode(). Afterwards no multi-line element is in progress.
void MarkupParser::flush() {
  Buffer.clear();
  NextIdx = 0;
  Line = {};
  if (InProgressMultiline.empty())
    return;
  // An element may have been finished earlier on the same line, leaving
  // FinishedMultiline non-empty. Clear it before swapping; otherwise its stale
  // text would end up in InProgressMultiline and the parser would still look
  // like it is in the middle of a multi-line element.
  FinishedMultiline.clear();
  FinishedMultiline.swap(InProgressMultiline);
  parseTextOutsideMarkup(FinishedMultiline);
}

// Scans Line for the first well-formed markup element, i.e. a "{{{" ... "}}}"
// span whose tag (the content up to the first ':') is non-empty. Spans with an
// empty tag are skipped. Returns None when no such element exists.
//
// In the returned node, Text covers the whole element including its markers,
// Tag is the tag, and Fields holds the ':'-separated fields following the tag.
Optional<MarkupNode> MarkupParser::parseElement(StringRef Line) {
  constexpr size_t MarkerLen = 3; // Length of both "{{{" and "}}}".

  for (;;) {
    // Locate the next candidate using the begin and end markers.
    const size_t OpenPos = Line.find("{{{");
    if (OpenPos == StringRef::npos)
      return None;
    const size_t ClosePos = Line.find("}}}", OpenPos + MarkerLen);
    if (ClosePos == StringRef::npos)
      return None;
    const size_t PastClose = ClosePos + MarkerLen;

    MarkupNode Node;
    Node.Text = Line.slice(OpenPos, PastClose);
    // Continue after this candidate if it turns out to be invalid.
    Line = Line.substr(PastClose);

    // Split the content between the markers into tag and field text.
    const StringRef Inner =
        Node.Text.drop_front(MarkerLen).drop_back(MarkerLen);
    const std::pair<StringRef, StringRef> TagAndFields = Inner.split(':');
    Node.Tag = TagAndFields.first;
    if (Node.Tag.empty())
      continue;

    const StringRef FieldText = TagAndFields.second;
    if (FieldText.empty()) {
      // A trailing ':' right after the tag denotes a single empty field.
      if (Inner.back() == ':')
        Node.Fields.push_back(FieldText);
    } else {
      FieldText.split(Node.Fields, ":");
    }

    return Node;
  }
}

// Builds a node whose Text is the given string; its other members keep their
// default-constructed state.
static MarkupNode textNode(StringRef Text) {
  MarkupNode Plain;
  Plain.Text = Text;
  return Plain;
}

// Appends nodes for a region of text that lies outside every markup element.
// The region can still contain SGR control codes, so it is cut into
// alternating pieces: each control code becomes its own node, and so does each
// non-empty run of text between codes. Empty input appends nothing.
void MarkupParser::parseTextOutsideMarkup(StringRef Text) {
  if (Text.empty())
    return;

  SmallVector<StringRef> Groups;
  while (SGRSyntax.match(Text, &Groups)) {
    // The first entry is the matched control code.
    const StringRef Code = Groups.front();

    // Emit whatever text precedes the control code.
    const StringRef Leading = takeTo(Text, Code.begin());
    if (!Leading.empty())
      Buffer.push_back(textNode(Leading));

    Buffer.push_back(textNode(Code));
    advanceTo(Text, Code.end());
  }

  // Whatever follows the last control code is ordinary text.
  if (Text.empty())
    return;
  Buffer.push_back(textNode(Text));
}

// For a line known to contain no valid markup element, checks whether it ends
// with the opening of a multi-line element. If so, returns the part of the
// line from the opening "{{{" to the end of the line; otherwise returns None.
Optional<StringRef> MarkupParser::parseMultiLineBegin(StringRef Line) {
  // Only the last begin marker on the line can open a multi-line element.
  const size_t MarkerPos = Line.rfind("{{{");
  if (MarkerPos == StringRef::npos)
    return None;
  const size_t TagStart = MarkerPos + 3;

  // An end marker after the begin marker rules out a multi-line element.
  if (Line.find("}}}", TagStart) != StringRef::npos)
    return None;

  // The tag runs up to the first ':' and must be registered as multi-line.
  const size_t TagEnd = Line.find(':', TagStart);
  if (TagEnd == StringRef::npos)
    return None;
  if (!MultilineTags.contains(Line.slice(TagStart, TagEnd)))
    return None;

  return Line.substr(MarkerPos);
}

// Looks for the end of an in-progress multi-line element. Returns the part of
// Line up to and including the first "}}}", or None if the line has no end
// marker.
Optional<StringRef> MarkupParser::parseMultiLineEnd(StringRef Line) {
  const size_t MarkerPos = Line.find("}}}");
  if (MarkerPos != StringRef::npos)
    return Line.take_front(MarkerPos + 3);
  return None;
}

} // end namespace symbolize
} // end namespace llvm