C ++中的简单词法分析器

几年来我都没有用C ++编写任何东西，所以我既忘记了很多东西，也没有接触过现代C ++。我正在研究一种玩具编程语言，而其他部分虽然很大，所以很难复习，但词法分析器相当孤立。对于如何改进代码，我将不胜感激。

lexer.h

#ifndef LEXER_H
#define LEXER_H

#include "common.h"
#include "compilation_context.h"
#include "lexer_common.h"

#include <deque>
#include <fstream>
#include <iostream>

namespace schwifty {

struct Token {
    enum class Type {
        eof = 1,
        eol = 2,
        indentation = 3,
        identifier = 4,
        def = 5,
        symbol = 6,
        string = 7,
        class_ = 8,
        import = 9,
        if_ = 10,
        int_ = 11,
        return_ = 12,
        type_identifier = 13,
        extern_ = 14,
        own = 15,
        else_ = 16,
        elif = 17,
        while_ = 18,
        mut = 19,
        and_ = 20,
        or_ = 21
    };

    explicit Token(Type type, const LexingContext& lexing_context)
        : type(type), integer(0), lexing_context(lexing_context) {
        CHECK(type != Type::symbol);
        CHECK(type != Type::int_);
    }

    explicit Token(Type type, const string& value,
                   const LexingContext& lexing_context)
        : type(type), value(value), integer(0), lexing_context(lexing_context) {
        CHECK(!value.empty());
    }

    explicit Token(int integer, const LexingContext& lexing_context)
        : type(Type::int_),
          value(""),
          integer(integer),
          lexing_context(lexing_context) {}

    explicit Token(const Token& token)
        : type(token.type),
          value(token.value),
          integer(token.integer),
          lexing_context(token.lexing_context) {}

    bool is_symbol(const string& expected_symbol = "") const {
        return type == Type::symbol &&
               (expected_symbol.empty() || value == expected_symbol);
    }

    string type_to_string() const {
        switch (type) {
            case Type::eof:
                return "token_eof";
            case Type::eol:
                return "token_eol";
            case Type::indentation:
                return "token_indentation";
            case Type::identifier:
                return "token_identifier";
            case Type::def:
                return "token_def";
            case Type::symbol:
                return "token_symbol";
            case Type::string:
                return "token_string";
            case Type::class_:
                return "token_class";
            case Type::import:
                return "token_import";
            case Type::if_:
                return "token_if";
            case Type::int_:
                return "token_int";
            case Type::return_:
                return "token_return";
            case Type::type_identifier:
                return "token_type_identifier";
            case Type::extern_:
                return "extern";
            case Type::own:
                return "own";
            case Type::else_:
                return "else";
            case Type::elif:
                return "elif";
            case Type::while_:
                return "while";
            default:
                return std::to_string(static_cast<int>(type));
        }
    }

    string to_string() const {
        return "type: " + type_to_string() +
               (value.empty() ? "" : ", value: " + value) +
               (type == Type::int_
                      ? (string(", integer: ") + std::to_string(integer))
                      : "") +
               "\n " + lexing_context.to_string();
    };

    string to_string_for_error() const {
        switch (type) {
            case Type::int_:
                return std::to_string(integer);
            default:
                return value;
        }
    }

    const Type type;
    const string value;
    const int integer;
    const LexingContext lexing_context;
};

bool operator==(const Token& first, const Token& second);

class InputSource {
public:
    virtual string get_file_name() = 0;

    virtual bool has_next_file() = 0;

    bool open_next_file() {
        line_no_ = 0;
        return do_open_next_file();
    }

    virtual bool do_open_next_file() = 0;

    bool get_line(string* line) {
        bool result = do_get_line(line);
        if (result) {
            line_no_++;
        }
        return result;
    }

    virtual bool do_get_line(string* line) = 0;

    int get_line_no() { return line_no_; }

protected:
    int line_no_ = 0;
};

class StandardInputSource : public InputSource {
public:
    string get_file_name() override { return "<stdin>"; }

    bool has_next_file() override { return !started_reading_; }

    bool do_open_next_file() override { return !started_reading_; }

    bool do_get_line(string* line) override;

private:
    bool started_reading_ = false;
};

class FileInputSource : public InputSource {
private:
    int current_ = 0;
    std::ifstream input_stream_;
    vector<string> file_names_;

public:
    FileInputSource() {}
    ~FileInputSource() {
        if (input_stream_.is_open()) {
            input_stream_.close();
        }
    }

    void add_file(const string& path) { file_names_.push_back(path); }

    bool do_open_next_file() override {
        if (input_stream_.is_open()) {
            input_stream_.close();
        }
        if (current_ >= file_names_.size()) {
            return false;
        }
        input_stream_.open(file_names_[current_]);
        current_++;
        return (bool) input_stream_;
    }

    bool do_get_line(string* line) override {
        if (!input_stream_) {
            return false;
        }
        return (bool) std::getline(input_stream_, *line);
    }

    bool has_next_file() override { return current_ < file_names_.size(); }

    string get_file_name() override {
        if (current_ < 0 || current_ > file_names_.size()) {
            return "";
        }
        return file_names_[current_ - 1];
    }
};

class Lexer {
public:
    Lexer(CompilationContext& context, InputSource* source);

    const Token* get_token();

    const Token* peek_token();

    const Token* peek_token_skip_indentation();

    const int peek_indentation();

private:
    CompilationContext& context_;
    int line_counter_;
    string current_line_;
    InputSource* source_;
    std::deque<const Token*> tokens_;

    bool parse_line();

    bool read_line();

    LexingContext create_lexing_context();

    void push_symbol(const string& symbol);

    bool is_a_number(int index);

    std::set<char> single_char_symbols_;
};

}  // namespace schwifty

#endif  // LEXER_H

lexer.cc

#include "lexer.h"

#include <iostream>

#include "errors.h"
#include "utils.h"

namespace schwifty {

const static int INDENT_SPACES = 4;

bool StandardInputSource::do_get_line(string* line) {
    if (!started_reading_) {
        return false;
    }
    return (bool) getline(std::cin, *line);
}

Lexer::Lexer(CompilationContext& context, InputSource* source)
    : context_(context), line_counter_(0), source_(source) {
    single_char_symbols_.insert('(');
    single_char_symbols_.insert(')');
    single_char_symbols_.insert('{');
    single_char_symbols_.insert('}');
    single_char_symbols_.insert('>');
    single_char_symbols_.insert('<');
    single_char_symbols_.insert('-');
    single_char_symbols_.insert('+');
    single_char_symbols_.insert('*');
    single_char_symbols_.insert('/');
    single_char_symbols_.insert(':');
    single_char_symbols_.insert(',');
    single_char_symbols_.insert('.');
    single_char_symbols_.insert('[');
    single_char_symbols_.insert(']');
}

const Token* Lexer::get_token() {
    if (tokens_.empty()) {
        if (!parse_line()) {
            return new Token(Token::Type::eof, create_lexing_context());
        }
    }
    const Token* result = tokens_.front();
    tokens_.pop_front();
    VLOG(3) << result->to_string();
    return result;
}

const int Lexer::peek_indentation() {
    if (tokens_.empty()) {
        if (!parse_line()) {
            return 0;
        }
    }
    int i = 0;
    for (; i < tokens_.size() && tokens_[i]->type == Token::Type::indentation;
         i++)
        ;
    return i;
}

const Token* Lexer::peek_token() {
    if (tokens_.empty()) {
        if (!parse_line()) {
            return new Token(Token::Type::eof, create_lexing_context());
        }
    }
    return tokens_.front();
}

const Token* Lexer::peek_token_skip_indentation() {
    if (tokens_.empty()) {
        if (!parse_line()) {
            return new Token(Token::Type::eof, create_lexing_context());
        }
    }
    int i = 0;
    for (; i < tokens_.size() && tokens_[i]->type == Token::Type::indentation;
         i++)
        ;
    CHECK(i < tokens_.size())
          << utils::join(utils::to_string_ptr(tokens_), "\n");
    return tokens_[i];
}

bool Lexer::read_line() {
    if (source_->get_line(&current_line_)) {
        return true;
    }
    return source_->has_next_file() && source_->open_next_file() &&
           source_->get_line(&current_line_);
};

bool Lexer::parse_line() {
    if (!read_line()) {
        return false;
    }
    int i = 0;
    while (isspace(current_line_[i])) {
        if (current_line_[i] != ' ') {
            context_.add_error(make_unique<Error>(
                  Error::INDENTATION_ONLY_SPACES, source_->get_file_name(),
                  current_line_, source_->get_line_no(), false));
        }
        i++;
    }
    if (i % INDENT_SPACES) {
        std::cout << "line " << line_counter_
                  << ": one indent is always 4 spaces long. Treating " << i
                  << " spaces as " << i / INDENT_SPACES + 1 << "indents";
    }
    int indents = (i + INDENT_SPACES - 1) / INDENT_SPACES;
    for (int j = 0; j < indents; j++) {
        tokens_.push_back(
              new Token(Token::Type::indentation, create_lexing_context()));
    }
    while (i < current_line_.size()) {
        if (isalpha(current_line_[i])) {
            const int start = i;
            while (i < current_line_.size() &&
                   (isalnum(current_line_[i]) || current_line_[i] == '_')) {
                i++;
            }
            string identifier = current_line_.substr(start, i - start);
            if (identifier == "def") {
                tokens_.push_back(
                      new Token(Token::Type::def, create_lexing_context()));
                i++;  // space afterwards
            } else if (identifier == "class") {
                tokens_.push_back(
                      new Token(Token::Type::class_, create_lexing_context()));
                i++;  // space afterwards
            } else if (identifier == "import") {
                tokens_.push_back(
                      new Token(Token::Type::import, create_lexing_context()));
                i++;  // space afterwards
            } else if (identifier == "if") {
                tokens_.push_back(
                      new Token(Token::Type::if_, create_lexing_context()));
                i++;  // space afterwards
            } else if (identifier == "return") {
                tokens_.push_back(
                      new Token(Token::Type::return_, create_lexing_context()));
                i++;  // space afterwards
            } else if (identifier == "extern") {
                tokens_.push_back(
                      new Token(Token::Type::extern_, create_lexing_context()));
                // no space afterwards
            } else if (identifier == "own") {
                tokens_.push_back(
                      new Token(Token::Type::own, create_lexing_context()));
                i++;  // space aftwards
            } else if (identifier == "elif") {
                tokens_.push_back(
                      new Token(Token::Type::elif, create_lexing_context()));
                i++;  // space afterwards
            } else if (identifier == "else") {
                tokens_.push_back(
                      new Token(Token::Type::else_, create_lexing_context()));
                // no space afterwards
            } else if (identifier == "while") {
                tokens_.push_back(
                      new Token(Token::Type::while_, create_lexing_context()));
                i++;  // space afterwards
            } else if (identifier == "mut") {
                tokens_.push_back(
                      new Token(Token::Type::mut, create_lexing_context()));
                i++;  // space afterwards
            } else if (identifier == "and") {
                tokens_.push_back(
                      new Token(Token::Type::and_, create_lexing_context()));
                i++;
            } else if (identifier == "or") {
                tokens_.push_back(
                      new Token(Token::Type::or_, create_lexing_context()));
                i++;
            } else {
                if (isupper(identifier[0])) {
                    tokens_.push_back(new Token(Token::Type::type_identifier,
                                                identifier,
                                                create_lexing_context()));
                } else {
                    tokens_.push_back(new Token(Token::Type::identifier,
                                                identifier,
                                                create_lexing_context()));
                }
            }
        } else if (is_a_number(i)) {
            // Not alpha, but num. Must be a number.
            const int start = i;
            do {
                i++;
            } while (i < current_line_.size() && isdigit(current_line_[i]));
            tokens_.push_back(
                  new Token(std::stoi(current_line_.substr(start, i - start)),
                            create_lexing_context()));
        } else if (current_line_[i] == '\'' || current_line_[i] == '"') {
            const char mark = current_line_[i];
            i++;
            int start = i;
            while (current_line_[i] != mark) {
                i++;
            }
            tokens_.push_back(new Token(Token::Type::string,
                                        current_line_.substr(start, i - start),
                                        create_lexing_context()));
            i++;
        } else {
            if (current_line_[i] == ' ') {
                i++;
            } else {
                if (utils::contains(single_char_symbols_, current_line_[i])) {
                    push_symbol(string(1, current_line_[i]));
                    i++;
                } else if (current_line_[i] == '!' &&
                           i + 1 < current_line_.size() &&
                           current_line_[i + 1] == '=') {
                    push_symbol("!=");
                    i += 2;
                } else if (current_line_[i] == '=') {
                    if (i + 1 < current_line_.size() &&
                        current_line_[i + 1] == '=') {
                        push_symbol("==");
                        i += 2;
                    } else {
                        push_symbol("=");
                        i++;
                    }
                } else {
                    context_.add_error(make_unique<Error>(
                          Error::UNRECOGNIZED_SYMBOL, source_->get_file_name(),
                          current_line_, source_->get_line_no(), false));
                    i++;
                }
            }
        }
    }
    tokens_.push_back(new Token(Token::Type::eol, create_lexing_context()));
    return true;
}

void Lexer::push_symbol(const string& symbol) {
    tokens_.push_back(
          new Token(Token::Type::symbol, symbol, create_lexing_context()));
}

LexingContext Lexer::create_lexing_context() {
    LexingContext context(source_->get_file_name(), current_line_,
                          source_->get_line_no());
    return context;
}

bool Lexer::is_a_number(int index) {
    return isdigit(current_line_[index]) ||
           (current_line_[index] == '-' && index + 1 < current_line_.size() &&
            isdigit(current_line_[index + 1]));
}

bool operator==(const Token& first, const Token& second) {
    return first.type == second.type && first.value == second.value &&
           first.integer == second.integer;
};
}  // namespace schwifty

小心：此代码正在各处泄漏内存。

我对算法的主要评价，并在非常快速地阅读了代码之后：您应该使用DFA，它比进行比较要快，而且这是词法分析器的最新技术（请参阅* lex系列的例如词法生成器）。

Synxis：TBH目前还不确定他们是否在讲玩具语言。

康拉德·鲁道夫（Konrad Rudolph）：您能给我一些发现泄漏的提示吗？

是时候变胖了吗？

#1 楼

除了弗兰克（Frank）的注释（有效）之外：

C ++注释

可能的消息腐烂

if (i % INDENT_SPACES) {
    std::cout << "line " << line_counter_
              << ": one indent is always 4 spaces long. Treating " << i
              << " spaces as " << i / INDENT_SPACES + 1 << "indents";
}
int indents = (i + INDENT_SPACES - 1) / INDENT_SPACES;

在这里，您对indents进行了两次计算，根据经验，它们往往会不同步。这里的解决方法很简单：只需交换行并在打印时使用indents变量即可。

不返回const int

这没有任何意义（并且不可执行）。只需返回int ...

使用较少的动态分配

context_.add_error(make_unique<Error>( /* ... */ ));

这将导致包含错误负载的源的损失，并且添加不必要的分配。我认为在这种情况下，您可以直接使用值（弗兰克已经说过）。

关于算法

常规

从阅读代码，看来您的语言不是自由格式的（也就是说，缩进和换行很重要）。我不知道是否需要这样做，但是它完全禁止制表。

总的来说，词法分析器可以看作是非常简单的编译器，将字符串作为输入，并输出一个词素数组，通常通常由枚举值确定（标识符除外）。由于它们只是入口-而且您希望您的语言标记易于被人类区分-词法分析器的“语言”也应该很简单。这就是为什么词法分析器的“语言”通常是常规语言的原因之一。
重要：只有词法分析器的“语言”是常规的，您的玩具语言可能不是常规的（但这无关紧要，因为它是一个单独的编译阶段）。

必然结果是，要有效地解析常规语言，您需要DFA。您可以手工编写（很乏味，但很适合学习），也可以为此使用工具。

更多的模块化代码

首先，我将把缩进本身的功能：

void parse_indentation()
{
    int i = 0;
    while (isspace(current_line_[i]))
    {
        if (current_line_[i] != ' ')
        {
            context_.add_error(make_unique<Error>(
                  Error::INDENTATION_ONLY_SPACES, source_->get_file_name(),
                  current_line_, source_->get_line_no(), false));
        }
        i++;
    }
    int indents = (i + INDENT_SPACES - 1) / INDENT_SPACES;
    if (i % INDENT_SPACES)
        std::cout << "line " << line_counter_
                  << ": one indent is always 4 spaces long. Treating " << i
                  << " spaces as " << indents << "indents";

    for (int j = 0; j < indents; j++)
        tokens_.push_back(new Token(Token::Type::indentation, create_lexing_context()));
}

顺便说一句，您对while (isspace(current_line_[i]))所做的操作很好：您允许更多（即，即使不出现选项卡和\ r也是如此），因此您的错误报告会更好。

确定性有限自动机

现在，可以在DFA中更改剩余的while循环。这是一个数字，省略号，其他字符和带单引号的字符串的小示例：

显然，对于无法识别的字符，存在从K *状态到Error的链接。，但是出于可读性考虑，我将它们部分隐藏了。
一旦拥有自动机，您就会尝试将状态推进到无法达到的状态（达到最终状态或达到错误状态）。停止之前您所拥有的就是您想要的。

只需重复几次，直到整个行转换为词素即可。

其他

尽量避免沉重的令牌。如果可以使用std::string_view（c ++ 17），请这样做，因为std::string会进行动态分配（这会花费很多！）

\ $ \ begingroup \ $
非常感谢您的详细解释！我的理解是lex / flex会为我生成代码，但是我认为不可能的原因之一是因为空格在我编写的语言中很重要（而我的理解是lex / flex认为空格并不重要）。我是否正确理解您的建议：首先解析缩进，然后运行DFA，直到行尾？
\ $ \ endgroup \ $
– gruszczy
17年11月6日在20:54

\ $ \ begingroup \ $
从您的代码中，您只关心缩进和换行符。因此，首先解析缩进（这样就完成了），然后解析令牌直到结尾，然后解析换行符。是的，大量的空格对于flex来说是棘手的，但是您可以实现它们：matt.might.net/articles/standalone-lexers-with-lex（尽管我自己从来没有尝试过）。我仍然认为您自己做所有这些都会学到更多。
\ $ \ endgroup \ $
– Synxis
17年11月6日在21:47

#2 楼

辛苦了！

经过快速的第一步，这是我发现的主要物品，而没有特别的顺序。我敢肯定还有更多可以找到的，但是这些应该可以大大改善您的代码：

显式使用过度的

编辑：请参见下面的评论，这在以下情况中不正确每个场景。

explicit仅对带有单个非默认参数的构造函数有用。将其添加到具有更多参数的构造函数中是没有用的。

指针的怪异用法

const Token* peek_token();

在这里用指针返回没有意义。这并不是像您期望有人存储对令牌的长期引用那样。您应该改为通过引用返回。

请勿使用原始指针

std::deque<const Token*> tokens_;

应为

std::deque<std::unique_ptr<const Token>> tokens_;

没关系，因为：

不要在不必要的地方使用动态内存

std::deque<std::unique_ptr<const Token>> tokens_;

be：

std::deque<const Token> tokens_;

STL容器已经将对象存储在堆中，并且在管理它们方面做得很好，除非它们具有可笑的庞大sizeof()（这是超级可疑的）首先）。与通过稍微快一些的容器大小调整所获得的性能相比，在像疯狂的人那样的内存系统上浪费的性能要多得多，而在缓存性能方面却要差得多。

说到调整大小...

双端队列用于快速插入前面，而不是快速调整大小。

您不要在push_front()上使用tokens_，因此在此不应该使用双端队列。再一次，容器大小调整的边际增加几乎可以肯定不值得降低缓存性能。 std::vector<>的push_back()的摊销时间为O（1），就可以了。

InputSource设计是狡猾的

很难说，因为InputSource似乎是为了适应此处未显示的代码，但是通常任何“继承扩展”设计都是可疑的。

真的，据我所知，您的词法分析器可能应该仅在std::istream上运行，并让调用代码处理来自不同文件的串联令牌流。这会减少对词法分析器的责任，而只对Lex做事。

if if if block会很丑：

  if (identifier == "def") {
    tokens_.push_back(
      new Token(Token::Type::def, create_lexing_context()));
    i++;  // space afterwards
  } else if (identifier == "class") {
    tokens_.push_back(
      new Token(Token::Type::class_, create_lexing_context()));
    i++;  // space afterwards
  } else if (identifier == "import") {
...

可以简单地是：

std::unordered_map<std::string, Token::Type> tokens_;
...
auto found_token = tokens_.find(identifier);
if(found_token != tokens_.end()) {
  tokens_.push_back(
      new Token(found_token->second, create_lexing_context()));
    i++;  // space afterwards
}

\ $ \ begingroup \ $
“ explicit仅对具有单个非默认参数的构造函数有用”，不是true-它确定是否可以从初始化列表进行隐式转换。例如给定结构S {S（int，int）; };无效fn（S）;您可以调用fn（{3,3}），但是如果该构造函数是显式的，则不会编译（但fn（S {3,3}）可以）。
\ $ \ endgroup \ $
– Arthur Tacca
17年11月6日在13:27

\ $ \ begingroup \ $
非常感谢您的宝贵意见，我将开始应用所做的更改！我对使用双端队列有疑问：我正在代码中执行pop_front。 Vector专门没有pop_front，所以我必须使用擦除。但是我的理解是，这将需要按顺序移动所有其他元素。这就是为什么要使用双端队列的原因，因为尽管它总是一个完美的FIFO（我在这里使用），但我始终如此。
\ $ \ endgroup \ $
– gruszczy
17年11月6日在16:59

#3 楼

如果我可以提出建议，尽管编写自己的解析器很有意义，但是可以使用解析器生成器来实现新语言，这些语言可以帮助您专注于最终应用程序，而不是大量实现细节。我已经成功使用了ANTLR。它们具有C ++的绑定，可能恰好满足您的需求。如果您选择此路线，我还将建议您从Pragmatic Programmers获得《 Antlr参考手册》。在几天之内，您应该能够实现所需的内容。

\ $ \ begingroup \ $
感谢您的建议！我当时正在考虑使用lex / yacc或flex / bison，但是由于这是一个玩具项目，而且我想学习，因此我想自己实现编译的所有阶段。
\ $ \ endgroup \ $
– gruszczy
17年11月6日在17:01

#4 楼

不要将成员字段设为const。这在您复制对象时会产生问题。只需将const放在字段上，然后在需要查看的地方传递const ref。

您对缩进（一次只能有一个空格和4个）和空格进行非常严格的声明。相反，您可以遵循python的方案：如果新行较长，则前缀必须与前一行的缩进完全相同（将当前值推入堆栈）。如果更短，则在缩进堆栈中向后看，直到找到匹配项并弹出直到该点为止。缩进令牌的数量等于堆栈的大小。

您无法在字符串文字中表达转义字符。

我还将考虑在每个标记的行中添加偏移量。对于用户而言，将错误归结为正确的令牌比仅使用行要容易得多。特别是当他们想要钝的单线纸时。

\ $ \ begingroup \ $
感谢您的建议！您是否知道是否存在用于解析字符串文字中转义字符的库？这是我绝对希望拥有的功能，但我想尝试避免自己编码。
\ $ \ endgroup \ $
– gruszczy
17年7月7日在1:10

\ $ \ begingroup \ $
@gruszczy就像当前char是`\`一样简单，然后特别对待下一个char，n表示换行符，t表示制表符，u表示后面的数字是unicode值，等等。
\ $ \ endgroup \ $
–棘轮怪胎
17年7月7日在9:17

#5 楼

使用较少的动态分配
使用映射存储令牌
OtherWise很好的代码

\ $ \ begingroup \ $
使用shared_ptrs和and继承对代码进行模板化，并使用内联或原子类型来优化代码以获得更好的执行结果
\ $ \ endgroup \ $
–sameraze agvvl
20-2-22在10:39

编程黑洞网