protoc

以 protoc@b1c07 为基准阅读 protoc 源码。

基本结构

src/google/protobuf/compiler/

.
├── main.cc
├── code_generator.h
├── command_line_interface.cc (1)
├── importer.cc
├── parser.cc
├── plugin.h
├── plugin.proto (2)
├── retention.cc
├── subprocess.cc
├── zip_writer.cc
├── cpp
│   └── generator.h
├── csharp
│   └── csharp_generator.h
├── java
│   └── generator.h
├── objectivec
│   └── generator.h
├── php
│   └── php_generator.h
├── python
│   └── generator.h
├── ruby
│   └── ruby_generator.h
└── rust
    └── generator.h

1	命令行执行入口。
2	plugin 接口定义文件。

上面的目录树是经过处理的结果：删除了测试相关文件和实现细节文件，并且对成对出现的头文件与源文件只保留其中之一。

plugin.proto

plugin.proto 文件对插件做出了定义和解释：

// protoc 可以通过 plugins 拓展。一个插件只是从 stdin 中读取 CodeGeneratorRequest
// 然后将 CodeGeneratorResponse 写到 stdout。
//
// 使用 C++ 写的插件可以使用 google/protobuf/compiler/plugin.h 而不是处理 protocol
// 的原始定义。
//
// 插件应该放在 PATH 中的某个地方，名字应该为 "protoc-gen-$NAME"。之后向 protoc
// 传入 flag "--${NAME}_out" 时，就会使用该插件。
//
syntax = "proto2";

package google.protobuf.compiler;
option java_package = "com.google.protobuf.compiler";
option java_outer_classname = "PluginProtos";

option csharp_namespace = "Google.Protobuf.Compiler";
option go_package = "google.golang.org/protobuf/types/pluginpb";

import "google/protobuf/descriptor.proto";

// protocol compiler 的版本号
message Version {
  optional int32 major = 1;
  optional int32 minor = 2;
  optional int32 patch = 3;
  // A suffix for alpha, beta or rc release, e.g., "alpha-1", "rc2". It should
  // be empty for mainline stable releases.
  optional string suffix = 4;
}

// 一个编码的 CodeGeneratorRequest 被写到插件的 stdin。
message CodeGeneratorRequest {
  // 在命令行中显式列出的 .proto 文件。code generator 应该只为这些文件生成代码。
  // 每个文件的描述符被包含在下面的 proto_file 中。
  repeated string file_to_generate = 1;

  // 通过命令行传入的 generator 参数。
  optional string parameter = 2;

  // files_to_generate 中的所有文件及其导入的所有内容的 FileDescriptorProtos。
  // 文件将按拓扑顺序出现，因此每个文件都会出现在导入它的任何文件之前。
  //
  // NOTE: files_to_generate 中列出的文件将仅包含运行时保留选项，但所有其他文
  // 件将包含源保留选项。如果您需要 files_to_generate 的源保留选项，则可以使
  // 用下面的 source_file_descriptors 字段。
  //
  // protoc 保证所有 proto_files 都将在上述字段之后写入，尽管 protobuf 线上格
  // 式在技术上无法保证这一点。理论上，这可以允许插件在 FileDescriptorProtos
  // 中流式传输并逐个处理它们，而不是一次将整个集合读入内存。但是，截至撰写本
  // 文时，protoc 端尚未对此进行类似的优化：它会将所有字段一次性存储在内存中，
  // 然后再将它们发送到插件。
  //
  // Type names of fields and extensions in the FileDescriptorProto are always
  // fully qualified.
  repeated FileDescriptorProto proto_file = 15;

  // 带有全部选项（包括源保留选项）的文件描述符。仅为 files_to_generate
  // 中列出的文件提供这些描述符。
  repeated FileDescriptorProto source_file_descriptors = 17;

  // protocol compiler 版本号。
  optional Version compiler_version = 3;
}

// 插件将编码的 CodeGeneratorResponse 写到 stdout。
message CodeGeneratorResponse {
  // 错误消息。如果不为空，则代码生成失败。即使报告了错误，
  // 插件进程也应该以状态代码零退出。
  //
  // 这应该用于指示 .proto 文件中的错误，这些错误阻止代码生成器生成正确
  // 的代码。指示 protoc 本身存在问题的错误（例如输入的
  // CodeGeneratorRequest 无法解析）应该通过向 stderr 写入消息并以非零状
  // 态代码退出来报告。
  optional string error = 1;

  // code generator 支持的 features。代表了 Feature enum 的按位或。
  optional uint64 supported_features = 2;

  // Sync with code_generator.h.
  enum Feature {
    FEATURE_NONE = 0;
    FEATURE_PROTO3_OPTIONAL = 1;
    FEATURE_SUPPORTS_EDITIONS = 2;
  }

  // 此插件支持的最低 edition。这将被视为 Edition enum，但我们希望允许未知值。
  // 应按 Edition enum 的值指定，*而不是* edition 编号。仅对设置了
  // FEATURE_SUPPORTS_EDITIONS 的插件生效。
  optional int32 minimum_edition = 3;

  // 此插件支持的最高 edition。
  optional int32 maximum_edition = 4;

  // 代表了单个生成的文件。
  message File {
    // 相对于输出目录的文件名。文件名不得包含 "." 或 ".." 路径组件，且必须是相对
    // 路径而非绝对路径（因此文件不会位于输出目录之外）。"/" 总是被用作路径分隔符。
    //
    // 如果 name 被省略，内容将会被附加到先前的文件中。这允许 generator 将大
    // 文件拆分成小的 chunks，并允许生成的 text 流式传输到 protoc 以避免大文件
    // 一次性消耗过多内存。需要注意的是，截至撰写本文时 protoc 并未对此进行优化：
    // 在将文件写入磁盘前它会读取整个 CodeGeneratorResponse。
    optional string name = 1;

    // 如果非空，则表示文件应该已经存在，并且此处的内容将在定义的插入点插入到
    // 该文件中。此功能允许代码生成器扩展另一个代码生成器生成的输出。原始生成
    // 器可以通过在文件中放置特殊注释来提供插入点，如下所示：
    // @@protoc_insertion_point(NAME)
    // 可以在行的前后放置任意文本，这样就可以将其放在注释中。NAME 应该替换为
    // 一个插入点的标识符：其他生成器将使用它来引用该插入点。在此点插入的代码将放
    // 置在包含插入点的行的正上方（因此，对同一点的多次插入将按添加顺序出现）。
    // 双 @ 旨在使生成的代码不太可能包含意外看起来像插入点的内容。
    //
    // 比如，C++ code generator 将下面的内容放在它生成的 .pb.h 文件中：
    //   // @@protoc_insertion_point(namespace_scope)
    // 此行出现在文件的包命名空间范围内，但不在任何特定类的范围内。然后，另一个
    // 插件可以使用插入点“namespace_scope”来生成应放置在此范围内的其他类或其他声明。
    //
    // 请注意，如果包含插入点的行以空格开头，则插入文本的每一行都会添加相同的空
    // 格。这对于 Python 等语言非常有用，因为缩进很重要。在这些语言中，插入点注
    // 释的缩进量应与任何插入的代码的缩进量相同，以便在该上下文中正常工作。
    //
    // 生成初始文件的代码生成器和插入到该文件的代码生成器都必须作为对 protoc 的
    // 一次调用的一部分运行。代码生成器按照它们在命令行上出现的顺序执行。
    //
    // 若 |insertion_point| 存在，则 |name| 必须也存在。
    optional string insertion_point = 2;

    // 文件内容。
    optional string content = 15;

    // 描述被插入的文件内容的信息。如果使用了插入点，这些信息会经过相应的偏移调整后，
    // 合并到生成文件的代码生成元数据中。
    optional GeneratedCodeInfo generated_code_info = 16;
  }
  repeated File file = 15;
}

插件入口

plugin.h

int PluginMain(int argc, char* argv[], const CodeGenerator* generator);

// 使用给定的 code generator 生成代码。若代码生成成功，返回 true。若生成失败，
// 使用指定的错误填充 error_msg。
bool GenerateCode(
    const CodeGeneratorRequest& request,
    const CodeGenerator& generator,
    CodeGeneratorResponse* response,
    std::string* error_msg
);

plugin.{h, cc} 为插件提供了便利代码。在 PluginMain 中检查命令行参数并从 stdin 读取 CodeGeneratorRequest，最终调用 GenerateCode 生成代码。而 GenerateCode 内容如下：

plugin.cc

bool GenerateCode(
    const CodeGeneratorRequest& request,
    const CodeGenerator& generator,
    CodeGeneratorResponse* response,
    std::string* error_msg
) {
    for(int i = 0; i < request.proto_file_size(); i++) {
        const FileDescriptor* file = pool.BuildFile(request.proto_file(i)); (1)
        if(file == nullptr) {
            return false;
        }
    }

    std::vector<const FileDescriptor*> parsed_files;
    for(int i = 0; i < request.file_to_generate_size(); i++) {
        parsed_files.push_back(pool.FindFileByName(request.file_to_generate(i)));
    }

    GeneratorResponseContext context(request.compiler_version(), response, parsed_files);
    bool succeeded = generator.GenerateAll(parsed_files, request.parameter(), &context, &error); (2)
    response->set_supported_features(generator.GetSupportedFeatures());
    // ...
    return true;
}

1	解析 proto file。
2	调用 generator 接口生成代码。

插件实现

由于 Python code generator 相对简单，因此以 Python code generator 为例：

要实现 code generator，需要实现 CodeGenerator 接口：

class PROTOC_EXPORT CodeGenerator {
public:
    CodeGenerator() = default;
    // 为指定的 proto 文件生成代码。在输出目录中输出一或多个文件。
    //
    // 可以在命令行上指定要传递给生成器的参数。这旨在用于传递生成器特定的参数。
    // 如果没有给出参数，则为空。ParseGeneratorParameter（见下文）可用于在单个
    // 参数命令行标志内接受多个参数。
    //
    // 成功返回 true。否则返回 false 并填充错误信息。
    virtual bool Generate( (1)
        const FileDescriptor* file,
        const std::string& parameter,
        GeneratorContext* generator_context,
        std::string* error
    ) const = 0;

    // 为所有指定的 proto 文件生成代码
    virtual bool GenerateAll( (2)
        const std::vector<const FileDescriptor*>& files,
        const std::string& parameter,
        GeneratorContext* generator_context,
        std::string* error
    ) const;
};

1	纯虚函数。插件需要实现此函数。
2	非纯虚函数，带有默认实现（对每个文件依次调用 Generate），插件通常无需重写此接口。

在 Python CodeGenerator 中，Generate 实现如下：

bool Generator::Generate(
    const FileDescriptor* file,
    const std::string& parameter,
    GeneratorContext* context,
    std::string* error
) const {
    GeneratorOptions options = ParseParameter(parameter, error);
    if(!error->empty()) {
        return false;
    }
    file_ = file;

    std::string filename = GetFileName(file, ".py");

    proto_ = StripSourceRetentionOptions(*file_);
    proto_.SerializeToString(&file_descriptor_serialized_);

    std::unique_ptr<io::ZeroCopyOutputStream> output(context->Open(filename));
    io::Printer printer(output.get(), '$');
    printer_ = &printer;

    PrintTopBoilerplate();
    PrintImports();
    PrintFileDescriptor();
    printer_->Print("_globals = globals()\n");
    if(GeneratingDescriptorProto()) {
        printer_->Print("if not _descriptor._USE_C_DESCRIPTORS:\n");
        printer_->Indent();
        // Create enums before message descriptors
        PrintAllEnumsInFile(); (1)
        PrintMessageDescriptors();
        FixForeignFieldsInDescriptors();
        PrintResolvedFeatures();
        printer_->Outdent();
        printer_->Print("else:\n");
        printer_->Indent();
    }
    if(GeneratingDescriptorProto()) {
        printer_->Outdent();
    }
    std::string module_name = ModuleName(file->name());
    printer_->Print(
        "_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, '$module_name$', "
        "_globals)\n",
        "module_name", module_name
    );
    printer.Print("if not _descriptor._USE_C_DESCRIPTORS:\n");
    printer_->Indent();

    FixAllDescriptorOptions();
    SetSerializedPbInterval(proto_);

    printer_->Outdent();
    if(HasGenericServices(file)) {
        printer_->Print(
            "_builder.BuildServices(DESCRIPTOR, '$module_name$', _globals)\n",
            "module_name", module_name
        );
    }

    printer.Print("# @@protoc_insertion_point(module_scope)\n");

    return !printer.failed();
}

void Generator::PrintAllEnumsInFile() const {
    for(int i = 0; i < file_->enum_type_count(); ++i) {
        PrintEnum(*file_->enum_type(i), proto_.enum_type(i));
    }
    for(int i = 0; i < file_->message_type_count(); ++i) {
        PrintNestedEnums(*file_->message_type(i), proto_.message_type(i));
    }
}

1	生成 Enum。

以 Enum 为例，其实现如下：

void Generator::PrintEnum(
    const EnumDescriptor& enum_descriptor,
    const EnumDescriptorProto& proto
) const {
    absl::flat_hash_map<absl::string_view, std::string> m;
    std::string module_level_descriptor_name =
        ModuleLevelDescriptorName(enum_descriptor);

    m["descriptor_name"] = module_level_descriptor_name;
    m["name"] = enum_descriptor.name();
    m["full_name"] = enum_descriptor.full_name();
    m["file"] = kDescriptorKey;
    const char enum_descriptor_template[] =
        "$descriptor_name$ = _descriptor.EnumDescriptor(\n"
        "  name='$name$',\n"
        "  full_name='$full_name$',\n"
        "  filename=None,\n"
        "  file=$file$,\n"
        "  create_key=_descriptor._internal_create_key,\n"
        "  values=[\n";
    std::string options_string;
    proto.options().SerializeToString(&options_string);
    printer_->Print(m, enum_descriptor_template);
    printer_->Indent();
    printer_->Indent();

    for(int i = 0; i < enum_descriptor.value_count(); ++i) {
        PrintEnumValueDescriptor(*enum_descriptor.value(i), proto.value(i));
        printer_->Print(",\n");
    }

    printer_->Outdent();

    printer_->Print("],\n");
    printer_->Print("containing_type=None,\n");
    printer_->Print("serialized_options=$options_value$,\n", "options_value", OptionsValue(options_string));
    EnumDescriptorProto edp;
    printer_->Outdent();
    printer_->Print(")\n");
    printer_->Print("_sym_db.RegisterEnumDescriptor($name$)\n", "name", module_level_descriptor_name);
    printer_->Print("\n");
}

DescriptorPool

DescriptorPool 用于构建文件描述符 pool。DescriptorPool 是 protobuf 提供的解析中间层，用于解析文件及文件之间的依赖关系。通过 DescriptorPool 可以获取到文件的各种信息，包括但不限于：

文件名。
文件路径。
文件的依赖。
文件中包含的数据类型。
数据类型的字段。
字段的类型。

因此 DescriptorPool 本身的内容是交叉引用的。DescriptorPool 可以导出 FileDescriptorProto（src/google/protobuf/descriptor.proto）来进行验证。

DescriptorPool 主要接口如下：

class DescriptorPool {
public:
    DescriptorPool();

    const FileDescriptor* BuildFile(const FileDescriptorProto& proto);

    const Descriptor* FindMessageTypeByName(absl::string_view name) const;
    const FieldDescriptor* FindFieldByName(absl::string_view name) const;
    const FieldDescriptor* FindExtensionByName(absl::string_view name) const;
    const OneofDescriptor* FindOneofByName(absl::string_view name) const;
    const EnumDescriptor* FindEnumTypeByName(absl::string_view name) const;
    const EnumValueDescriptor* FindEnumValueByName(absl::string_view name) const;
    const ServiceDescriptor* FindServiceByName(absl::string_view name) const;
    const MethodDescriptor* FindMethodByName(absl::string_view name) const;

    const FieldDescriptor* FindExtensionByNumber(const Descriptor* extendee, int number) const;
    const FieldDescriptor* FindExtensionByPrintableName(
        const Descriptor* extendee, absl::string_view printable_name
    ) const;

    void FindAllExtensions(const Descriptor* extendee, std::vector<const FieldDescriptor*>* out) const;
};

BuildFile

BuildFile 的实现简化如下：

const FileDescriptor* DescriptorPool::BuildFile(
    const FileDescriptorProto& proto
) {
    DeferredValidation deferred_validation(this, nullptr);
    const FileDescriptor* file =
        DescriptorBuilder::New(this, tables_.get(), deferred_validation, nullptr)
            ->BuildFile(proto);
    if(deferred_validation.Validate()) {
        return file;
    }
}

BuildFile 中的实际工作落在了 DescriptorBuilder::BuildFile 中，其核心逻辑位于 DescriptorBuilder::BuildFileImpl，此函数的简化实现为：

FileDescriptor* DescriptorBuilder::BuildFileImpl(
    const FileDescriptorProto& proto, internal::FlatAllocator& alloc
) {
    FileDescriptor* result = alloc.AllocateArray<FileDescriptor>(1);
    // Make sure all dependencies are loaded.
    // ...

    // Convert children.
    BUILD_ARRAY(proto, result, message_type, BuildMessage, nullptr);
    BUILD_ARRAY(proto, result, enum_type, BuildEnum, nullptr);
    BUILD_ARRAY(proto, result, service, BuildService, nullptr);
    BUILD_ARRAY(proto, result, extension, BuildExtension, nullptr);

    // Note that the following steps must occur in exactly the specified order.

    // Cross-link.
    CrossLinkFile(result, proto);

    return result;
}

四次 BUILD_ARRAY 宏调用分别用于构建相应类型的描述符。以 enum_type 为例，BUILD_ARRAY 展开如下：

result->enum_type_count_ = proto.enum_type_size();
result->enum_types_ = alloc.AllocateArray<
    typename std ::remove_pointer<decltype(result->enum_types_)>::type>(
    proto.enum_type_size()
);
for(int i = 0; i < proto.enum_type_size(); i++) {
    BuildEnum(proto.enum_type(i), nullptr, result->enum_types_ + i, alloc);
};

以 BuildEnum 为例，实现如下：

void DescriptorBuilder::BuildEnum(const EnumDescriptorProto& proto, const Descriptor* parent, EnumDescriptor* result, internal::FlatAllocator& alloc) {
    const absl::string_view scope =
        (parent == nullptr) ? file_->package() : parent->full_name();

    result->all_names_ = AllocateNameStrings(scope, proto.name(), alloc);
    ValidateSymbolName(proto.name(), result->full_name(), proto);
    result->file_ = file_;
    result->containing_type_ = parent;
    result->is_placeholder_ = false;
    result->is_unqualified_placeholder_ = false;

    if(proto.value_size() == 0) {
        // 不允许枚举没有值。因为这意味着对于这个类型没有默认值
        AddError(result->full_name(), proto, DescriptorPool::ErrorCollector::NAME, "Enums must contain at least one value.");
    }

    BUILD_ARRAY(proto, result, value, BuildEnumValue, result); (1)
    BUILD_ARRAY(proto, result, reserved_range, BuildReservedRange, result);

    // Copy reserved names.
    int reserved_name_count = proto.reserved_name_size();
    result->reserved_name_count_ = reserved_name_count;
    result->reserved_names_ = alloc.AllocateArray<const std::string*>(reserved_name_count);
    for(int i = 0; i < reserved_name_count; ++i) {
        result->reserved_names_[i] = alloc.AllocateStrings(proto.reserved_name(i));
    }

    AddSymbol(result->full_name(), parent, result->name(), proto, Symbol(result));
}

1	展开 EnumValue。

从上面的代码可以看出：protoc 在解析文件的过程中，将文件中的所有类型展开到了 DescriptorPool 中。使得类型呈现出了一级扁平的、交叉的结构。