//===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains the main building blocks of macro support in
/// clang-format.
///
/// In order to not violate the requirement that clang-format can format files
/// in isolation, clang-format's macro support uses expansions users provide
/// as part of clang-format's style configuration.
///
/// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support
/// one level of expansion (\see MacroExpander for a full description of what
/// is supported).
///
/// As part of parsing, clang-format uses the MacroExpander to expand the
/// spelled token streams into expanded token streams when it encounters a
/// macro call. The UnwrappedLineParser continues to parse UnwrappedLines
/// from the expanded token stream.
/// After the expanded unwrapped lines are parsed, the MacroCallReconstructor
/// matches the spelled token stream into unwrapped lines that best resemble the
/// structure of the expanded unwrapped lines. These reconstructed unwrapped
/// lines are aliasing the tokens in the expanded token stream, so that token
/// annotations will be reused when formatting the spelled macro calls.
///
/// When formatting, clang-format annotates and formats the expanded unwrapped
/// lines first, determining the token types. Next, it formats the spelled
/// unwrapped lines, keeping the token types fixed, while allowing other
/// formatting decisions to change.
///
//===----------------------------------------------------------------------===//
#ifndef CLANG_LIB_FORMAT_MACROS_H
#define CLANG_LIB_FORMAT_MACROS_H
#include <list>
#include <map>
#include <string>
#include <vector>
#include "FormatToken.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
namespace clang {
namespace format {
struct UnwrappedLine;
struct UnwrappedLineNode;
/// Takes a set of macro definitions as strings and allows expanding calls to
/// those macros.
///
/// For example:
/// Definition: A(x, y)=x + y
/// Call : A(int a = 1, 2)
/// Expansion : int a = 1 + 2
///
/// Expansion does not check arity of the definition.
/// If fewer arguments than expected are provided, the remaining parameters
/// are considered empty:
/// Call : A(a)
/// Expansion: a +
/// If more arguments than expected are provided, they will be discarded.
///
/// The expander does not support:
/// - recursive expansion
/// - stringification
/// - concatenation
/// - variadic macros
///
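/// Furthermore, only a single expansion of each macro argument is supported,
/// so that we cannot get conflicting formatting decisions from different
/// expansions. As the example below shows, only the first use of a parameter
/// is replaced by the argument; later uses keep the parameter name:
///   Definition: A(x)=x+x
///   Call      : A(id)
///   Expansion : id+x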
///
class MacroExpander {
public:
  /// The arguments of a macro call: one token sequence per positional
  /// argument.
  using ArgsList = llvm::ArrayRef<llvm::SmallVector<FormatToken *, 8>>;

  /// Construct a macro expander from a set of macro definitions.
  /// Macro definitions must be encoded as UTF-8.
  ///
  /// Each entry in \p Macros must conform to the following simple
  /// macro-definition language:
  /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion>
  /// <params>     ::= <id-list> | ""
  /// <id-list>    ::= <id> | <id> "," <params>
  /// <expansion>  ::= "=" <tail> | <eof>
  /// <tail>       ::= <tok> <tail> | <eof>
  ///
  /// Macros that cannot be parsed will be silently discarded.
  ///
  /// \p SourceMgr, \p Style, \p Allocator and \p IdentTable are stored by
  /// reference, so the referenced objects must outlive the expander.
  MacroExpander(const std::vector<std::string> &Macros,
                clang::SourceManager &SourceMgr, const FormatStyle &Style,
                llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
                IdentifierTable &IdentTable);

  /// Defined out of line (\c Definition is only forward-declared in this
  /// header).
  ~MacroExpander();

  /// Returns whether a macro \p Name is defined.
  bool defined(llvm::StringRef Name) const;

  /// Returns whether the macro has no arguments and should not consume
  /// subsequent parentheses.
  bool objectLike(llvm::StringRef Name) const;

  /// Returns the expanded stream of format tokens for \p ID, where
  /// each element in \p Args is a positional argument to the macro call.
  llvm::SmallVector<FormatToken *, 8> expand(FormatToken *ID,
                                             ArgsList Args) const;

private:
  // Both types are implementation details defined outside this header.
  struct Definition;
  class DefinitionParser;

  // Handles a single macro definition string.
  // NOTE(review): presumably called once per entry of the constructor's
  // Macros argument - confirm in the implementation file.
  void parseDefinition(const std::string &Macro);

  // Non-owning references handed in through the constructor.
  clang::SourceManager &SourceMgr;
  const FormatStyle &Style;
  llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
  IdentifierTable &IdentTable;

  // Memory buffers owned by the expander.
  // NOTE(review): presumably the backing storage for the definition text -
  // confirm in the implementation file.
  // Spelled with the llvm:: qualifier like every other SmallVector in this
  // header, so the declaration does not rely on a using-declaration pulled
  // in through another header.
  llvm::SmallVector<std::unique_ptr<llvm::MemoryBuffer>> Buffers;

  // The known macro definitions, looked up by string key.
  // NOTE(review): the key is presumably the macro name - confirm.
  llvm::StringMap<Definition> Definitions;
};
/// Converts a sequence of UnwrappedLines containing expanded macros into a
/// single UnwrappedLine containing the macro calls. This UnwrappedLine may be
/// broken into child lines, in a way that best conveys the structure of the
/// expanded code.
///
/// In the simplest case, a spelled UnwrappedLine contains one macro, and after
/// expanding it we have one expanded UnwrappedLine. In general, macro
/// expansions can span UnwrappedLines, and multiple macros can contribute
/// tokens to the same line. We keep consuming expanded lines until:
/// * all expansions that started have finished (we're not chopping any macros
/// in half)
/// * *and* we've reached the end of a *spelled* unwrapped line.
///
/// A single UnwrappedLine represents this chunk of code.
///
/// After this point, the state of the spelled/expanded stream is "in sync"
/// (both at the start of an UnwrappedLine, with no macros open), so the
/// MacroCallReconstructor can be thrown away and parsing can continue.
///
/// Given a mapping from the macro name identifier token in the macro call
/// to the tokens of the macro call, for example:
/// CLASSA -> CLASSA({public: void x();})
///
/// When getting the formatted lines of the expansion via the \c addLine method
/// (each '->' specifies a call to \c addLine ):
/// -> class A {
/// -> public:
/// -> void x();
/// -> };
///
/// Creates the tree of unwrapped lines containing the macro call tokens so that
/// the macro call tokens fit the semantic structure of the expanded formatted
/// lines:
/// -> CLASSA({
/// -> public:
/// -> void x();
/// -> })
class MacroCallReconstructor {
public:
  /// Creates a reconstructor whose resulting \c UnwrappedLine will start at
  /// \p Level. \p ActiveExpansions maps the name identifier token of a macro
  /// call to the tokens of the corresponding spelled macro call.
  MacroCallReconstructor(
      unsigned Level,
      const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
          &ActiveExpansions);

  /// For the given \p Line, matches all occurrences of tokens expanded from a
  /// macro to unwrapped lines in the spelled macro call, so that the
  /// resulting tree of unwrapped lines best resembles the structure of the
  /// unwrapped lines passed in via \c addLine.
  void addLine(const UnwrappedLine &Line);

  /// Checks whether, in the current state, there is no open macro expansion
  /// left that needs to be processed to finish a macro call.
  /// \c takeResult() can only be called to retrieve the resulting
  /// \c UnwrappedLine when \c finished() is true.
  /// If there are multiple subsequent macro calls within an unwrapped line in
  /// the spelled token stream, the calling code may also continue to call
  /// \c addLine() when \c finished() is true.
  bool finished() const { return ActiveExpansions.empty(); }

  /// Retrieves the formatted \c UnwrappedLine containing the original macro
  /// calls, formatted according to the expanded token stream received via
  /// \c addLine().
  ///
  /// Generally, this line tries to have the same structure as the expanded,
  /// formatted unwrapped lines handed in via \c addLine(), with the exception
  /// that for multiple top-level lines, each subsequent line will be the
  /// child of the last token in its predecessor. This representation is
  /// chosen because it is a precondition to the formatter that we get what
  /// looks like a single statement in a single \c UnwrappedLine (i.e.
  /// matching parens).
  ///
  /// If a token in a macro argument is a child of a token in the expansion,
  /// the parent will be the corresponding token in the macro call.
  /// For example:
  ///   #define C(a, b) class C { a b
  ///   C(int x;, int y;)
  /// would expand to
  ///   class C { int x; int y;
  /// where in a formatted line "int x;" and "int y;" would both be new
  /// separate lines.
  ///
  /// In the result, "int x;" will be a child of the opening parenthesis in
  /// "C(" and "int y;" will be a child of the "," token:
  ///   C (
  ///     \- int x;
  ///   ,
  ///     \- int y;
  ///   )
  UnwrappedLine takeResult() &&;

private:
  // Internal steps of the reconstruction; all of them are defined out of
  // line.
  void add(FormatToken *Token, FormatToken *ExpandedParent, bool First);
  void prepareParent(FormatToken *ExpandedParent, bool First);
  FormatToken *getParentInResult(FormatToken *Parent);
  void reconstruct(FormatToken *Token);
  void startReconstruction(FormatToken *Token);
  bool reconstructActiveCallUntil(FormatToken *Token);
  void endReconstruction(FormatToken *Token);
  bool processNextReconstructed();
  void finalize();

  // Forward declaration; the definition follows LineNode below.
  struct ReconstructedLine;

  void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr);
  UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level);
  void debug(const ReconstructedLine &Line, int Level);
  ReconstructedLine &parentLine();
  ReconstructedLine *currentLine();
  void debugParentMap() const;

#ifndef NDEBUG
  // Progress of the reconstruction; only present when NDEBUG is not defined.
  enum ReconstructorState {
    Start,      // No macro expansion was found in the input yet.
    InProgress, // During a macro reconstruction.
    Finalized,  // Past macro reconstruction, the result is finalized.
  };
  ReconstructorState State = Start;
#endif

  // A node of the unwrapped line under construction; the counterpart of
  // UnwrappedLineNode for this class.
  struct LineNode {
    LineNode() = default;
    LineNode(FormatToken *Token) : Tok(Token) {}

    FormatToken *Tok = nullptr;
    llvm::SmallVector<std::unique_ptr<ReconstructedLine>> Children;
  };

  // A line of the unwrapped line under construction.
  // FIXME: Investigate changing UnwrappedLine to a pointer type and using it
  // instead of rolling our own type.
  struct ReconstructedLine {
    llvm::SmallVector<std::unique_ptr<LineNode>> Tokens;
  };

  // Collects the reconstructed output.
  // To reduce special cases in the algorithm, the first level of the line
  // contains a single null token that has the reconstructed incoming lines
  // as children.
  // In the end, we stitch the lines together so that each subsequent line is
  // a child of the last token of the previous line. This is necessary in
  // order to format the overall expression as a single logical line - if we
  // created separate lines, we'd format them with their own top-level indent
  // depending on the semantic structure, which is not desired.
  ReconstructedLine Result;

  // Stack of currently "open" lines, where each line's predecessor's last
  // token is the parent token for that line.
  llvm::SmallVector<ReconstructedLine *> ActiveReconstructedLines;

  // Maps from the expanded token to the token that takes its place in the
  // reconstructed token stream in terms of parent-child relationships.
  // Note that it might take multiple steps to arrive at the correct parent
  // in the output.
  //
  // Given: #define C(a, b) []() { a; b; }
  // And a call: C(f(), g())
  // The structure in the incoming formatted unwrapped line will be:
  //   []() {
  //   |- f();
  //   \- g();
  //   }
  // with f and g being children of the opening brace.
  // In the reconstructed call:
  //   C(f(), g())
  //   \- f()
  //      \- g()
  // We want f to be a child of the opening parenthesis and g to be a child
  // of the comma token in the macro call.
  // Thus, we map
  //   { -> (
  // and add
  //   ( -> ,
  // once we're past the comma in the reconstruction.
  llvm::DenseMap<FormatToken *, FormatToken *>
      SpelledParentToReconstructedParent;

  // State of a single expansion while the tokens it generated are being
  // reconstructed.
  struct Expansion {
    // The identifier token of the macro call.
    FormatToken *ID;
    // Our current position in the reconstruction.
    std::list<UnwrappedLineNode>::iterator SpelledI;
    // The end of the reconstructed token sequence.
    std::list<UnwrappedLineNode>::iterator SpelledE;
  };

  // Stack of macro calls for which we're in the middle of an expansion.
  llvm::SmallVector<Expansion> ActiveExpansions;

  struct MacroCallState {
    MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken,
                   FormatToken *MacroCallLParen);

    ReconstructedLine *Line;

    // The last token in the parent line or expansion, or nullptr if the
    // macro expansion is on a top-level line.
    //
    // For example, in the macro call:
    //   auto f = []() { ID(1); };
    // The MacroCallState for ID will have '{' as ParentLastToken.
    //
    // In the macro call:
    //   ID(ID(void f()));
    // The MacroCallState of the outer ID will have nullptr as
    // ParentLastToken, while the MacroCallState for the inner ID will have
    // the '(' of the outer ID as ParentLastToken.
    //
    // In the macro call:
    //   ID2(a, ID(b));
    // The MacroCallState of ID will have ',' as ParentLastToken.
    FormatToken *ParentLastToken;

    // The l_paren of this MacroCallState's macro call.
    FormatToken *MacroCallLParen;
  };

  // Records, for each level in the macro call, the lines that receive the
  // opening brace/parenthesis and the argument-separating commas. This lets
  // us put the corresponding closing brace/parenthesis into the same line in
  // the output, and keeps track of which parents in the expanded token
  // stream map to which tokens in the reconstructed stream.
  // When an opening brace/parenthesis has children, we want the structure of
  // the output line to be:
  // |- MACRO
  // |- (
  // |  \- <argument>
  // |- ,
  // |  \- <argument>
  // \- )
  llvm::SmallVector<MacroCallState> MacroCallStructure;

  // Level the generated UnwrappedLine will be at.
  const unsigned Level;

  // Maps from identifier of the macro call to an unwrapped line containing
  // all tokens of the macro call.
  const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
      &IdToReconstructed;
};
} // namespace format
} // namespace clang
#endif