486 lines
17 KiB
C++
486 lines
17 KiB
C++
//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
|
||
//
|
||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
// See https://llvm.org/LICENSE.txt for license information.
|
||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
//
|
||
// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
|
||
// using UnicodeData.txt and NameAliases.txt available at
|
||
// https://unicode.org/Public/15.0.0/ucd/
|
||
//===----------------------------------------------------------------------===//
|
||
|
||
#include "llvm/ADT/Optional.h"
|
||
#include "llvm/ADT/STLExtras.h"
|
||
#include "llvm/ADT/StringExtras.h"
|
||
#include "llvm/ADT/StringRef.h"
|
||
#include <algorithm>
|
||
#include <array>
|
||
#include <deque>
|
||
#include <fstream>
|
||
#include <memory>
|
||
#include <set>
|
||
#include <string>
|
||
#include <unordered_map>
|
||
#include <utility>
|
||
#include <vector>
|
||
|
||
static const llvm::StringRef Letters =
|
||
" _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
|
||
|
||
// Collect names UnicodeData.txt and AliasNames.txt
|
||
// There may be multiple names per code points.
|
||
static std::unordered_multimap<char32_t, std::string>
|
||
loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
|
||
std::unordered_multimap<char32_t, std::string> CollectedCharacters;
|
||
auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
|
||
std::ifstream InputFile(File);
|
||
for (std::string Line; getline(InputFile, Line);) {
|
||
if (Line.empty() || !isxdigit(Line[0]))
|
||
continue;
|
||
auto FirstSemiPos = Line.find(';');
|
||
if (FirstSemiPos == std::string::npos)
|
||
continue;
|
||
auto SecondSemiPos = Line.find(';', FirstSemiPos + 1);
|
||
if (SecondSemiPos == std::string::npos)
|
||
continue;
|
||
unsigned long long CodePoint;
|
||
if (llvm::getAsUnsignedInteger(
|
||
llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) {
|
||
continue;
|
||
}
|
||
|
||
std::string Name =
|
||
Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1);
|
||
|
||
if (!Name.empty() && Name[0] == '<') {
|
||
// Ignore ranges of characters, as their name is either absent or
|
||
// generated.
|
||
continue;
|
||
}
|
||
|
||
// Some aliases are ignored for compatibility with C++
|
||
if (IsAliasFile) {
|
||
std::string Kind = Line.substr(SecondSemiPos + 1);
|
||
if (Kind != "control" && Kind != "correction" && Kind != "alternate")
|
||
continue;
|
||
}
|
||
|
||
auto InsertUnique = [&](char32_t CP, std::string Name) {
|
||
auto It = CollectedCharacters.find(CP);
|
||
while (It != std::end(CollectedCharacters) && It->first == CP) {
|
||
if (It->second == Name)
|
||
return;
|
||
++It;
|
||
}
|
||
CollectedCharacters.insert({CP, std::move(Name)});
|
||
};
|
||
InsertUnique(CodePoint, std::move(Name));
|
||
}
|
||
};
|
||
|
||
FromFile(NamesFile);
|
||
FromFile(AliasesFile, true);
|
||
return CollectedCharacters;
|
||
}
|
||
|
||
class Trie {
|
||
struct Node;
|
||
|
||
public:
|
||
// When inserting named codepoint
|
||
// We create a node per character in the name.
|
||
// SPARKLE becomes S <- P <- A <- R <- K <- L <- E
|
||
// Once all characters are inserted, the tree is compacted
|
||
void insert(llvm::StringRef Name, char32_t Codepoint) {
|
||
Node *N = Root.get();
|
||
for (auto Ch : Name) {
|
||
std::string Label(1, Ch);
|
||
auto It = llvm::find_if(N->Children,
|
||
[&](const auto &C) { return C->Name == Label; });
|
||
if (It == N->Children.end()) {
|
||
It = N->Children.insert(It, std::make_unique<Node>(Label, N));
|
||
}
|
||
N = It->get();
|
||
}
|
||
N->Value = Codepoint;
|
||
}
|
||
|
||
void compact() { compact(Root.get()); }
|
||
|
||
// This creates 2 arrays of bytes from the tree:
|
||
// A serialized dictionary of node labels,
|
||
// And the nodes themselves.
|
||
// The name of each label is found by indexing into the dictionary.
|
||
// The longest names are inserted first into the dictionary,
|
||
// in the hope it will contain shorter labels as substring,
|
||
// thereby reducing duplication.
|
||
// We could theorically be more clever by trying to minimizing the size
|
||
// of the dictionary.
|
||
std::pair<std::string, std::vector<uint8_t>> serialize() {
|
||
std::set<std::string> Names = this->getNameFragments();
|
||
std::vector<std::string> Sorted(Names.begin(), Names.end());
|
||
llvm::sort(Sorted, [](const auto &a, const auto &b) {
|
||
return a.size() > b.size();
|
||
});
|
||
std::string Dict(Letters.begin(), Letters.end());
|
||
Dict.reserve(50000);
|
||
for (const std::string &Name : Sorted) {
|
||
if (Name.size() <= 1)
|
||
continue;
|
||
if (Dict.find(Name) != std::string::npos)
|
||
continue;
|
||
Dict += Name;
|
||
}
|
||
|
||
if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
|
||
fprintf(stderr, "Dictionary too big to be serialized");
|
||
exit(1);
|
||
}
|
||
|
||
auto Bytes = dumpIndex(Dict);
|
||
return {Dict, Bytes};
|
||
}
|
||
|
||
std::set<std::string> getNameFragments() {
|
||
std::set<std::string> Keys;
|
||
collectKeys(Root.get(), Keys);
|
||
return Keys;
|
||
}
|
||
|
||
// Maps a valid char in an Unicode character name
|
||
// To a 6 bits index.
|
||
static uint8_t letter(char C) {
|
||
auto Pos = Letters.find(C);
|
||
assert(Pos != std::string::npos &&
|
||
"Invalid letter in Unicode character name");
|
||
return Pos;
|
||
}
|
||
|
||
// clang-format off
|
||
// +================+============+======================+=============+========+===+==============+===============+
|
||
// | 0 | 1 | 2-7 (6) | 8-23 | 24-44 | | 46 | 47 |
|
||
// +================+============+======================+=============+========+===+==============+===============+
|
||
// | Has Value | Has Long Name | Letter OR Name Size | Dict Index | Value | | Has Sibling | Has Children |
|
||
// +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
|
||
// clang-format on
|
||
|
||
std::vector<uint8_t> dumpIndex(const std::string &Dict) {
|
||
struct ChildrenOffset {
|
||
Node *FirstChild;
|
||
std::size_t Offset;
|
||
bool HasValue;
|
||
};
|
||
|
||
// Keep track of the start of each node
|
||
// position in the serialized data.
|
||
std::unordered_map<Node *, int32_t> Offsets;
|
||
|
||
// Keep track of where to write the index
|
||
// of the first children
|
||
std::vector<ChildrenOffset> ChildrenOffsets;
|
||
std::unordered_map<Node *, bool> SiblingTracker;
|
||
std::deque<Node *> AllNodes;
|
||
std::vector<uint8_t> Bytes;
|
||
Bytes.reserve(250'000);
|
||
// This leading byte is used by the reading code to detect the root node.
|
||
Bytes.push_back(0);
|
||
|
||
auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
|
||
for (std::size_t Index = 0; Index < Children.size(); Index++) {
|
||
const std::unique_ptr<Node> &Child = Children[Index];
|
||
AllNodes.push_back(Child.get());
|
||
if (Index != Children.size() - 1)
|
||
SiblingTracker[Child.get()] = true;
|
||
}
|
||
};
|
||
CollectChildren(Root->Children);
|
||
|
||
while (!AllNodes.empty()) {
|
||
const std::size_t Offset = Bytes.size();
|
||
Node *const N = AllNodes.front();
|
||
AllNodes.pop_front();
|
||
|
||
assert(!N->Name.empty());
|
||
Offsets[N] = Offset;
|
||
|
||
uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
|
||
// Single letter node are indexed in 6 bits
|
||
if (N->Name.size() == 1) {
|
||
FirstByte |= letter(N->Name[0]);
|
||
Bytes.push_back(FirstByte);
|
||
} else {
|
||
// Otherwise we use a 16 bits index
|
||
FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
|
||
Bytes.push_back(FirstByte);
|
||
auto PosInDict = Dict.find(N->Name);
|
||
assert(PosInDict != std::string::npos);
|
||
uint8_t Low = PosInDict;
|
||
uint8_t High = ((PosInDict >> 8) & 0xFF);
|
||
Bytes.push_back(High);
|
||
Bytes.push_back(Low);
|
||
}
|
||
|
||
const bool HasSibling = SiblingTracker.count(N) != 0;
|
||
const bool HasChildren = N->Children.size() != 0;
|
||
|
||
if (!!N->Value) {
|
||
uint32_t Value = (*(N->Value) << 3);
|
||
uint8_t H = ((Value >> 16) & 0xFF);
|
||
uint8_t M = ((Value >> 8) & 0xFF);
|
||
uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
|
||
uint8_t(HasChildren ? 0x02 : 0);
|
||
|
||
Bytes.push_back(H);
|
||
Bytes.push_back(M);
|
||
Bytes.push_back(L);
|
||
|
||
if (HasChildren) {
|
||
ChildrenOffsets.push_back(
|
||
ChildrenOffset{N->Children[0].get(), Bytes.size(), true});
|
||
// index of the first children
|
||
Bytes.push_back(0x00);
|
||
Bytes.push_back(0x00);
|
||
Bytes.push_back(0x00);
|
||
}
|
||
} else {
|
||
// When there is no value (that's most intermediate nodes)
|
||
// Dispense of the 3 values bytes, and only store
|
||
// 1 byte to track whether the node has sibling and children
|
||
// + 2 bytes for the index of the first children if necessary.
|
||
// That index also uses bytes 0-6 of the previous byte.
|
||
uint8_t Byte =
|
||
uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
|
||
Bytes.push_back(Byte);
|
||
if (HasChildren) {
|
||
ChildrenOffsets.emplace_back(
|
||
ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false});
|
||
Bytes.push_back(0x00);
|
||
Bytes.push_back(0x00);
|
||
}
|
||
}
|
||
CollectChildren(N->Children);
|
||
}
|
||
|
||
// Once all the nodes are in the inndex
|
||
// Fill the bytes we left to indicate the position
|
||
// of the children
|
||
for (const ChildrenOffset &Parent : ChildrenOffsets) {
|
||
const auto It = Offsets.find(Parent.FirstChild);
|
||
assert(It != Offsets.end());
|
||
std::size_t Pos = It->second;
|
||
if (Parent.HasValue) {
|
||
Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
|
||
} else {
|
||
Bytes[Parent.Offset] =
|
||
Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
|
||
}
|
||
Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
|
||
Bytes[Parent.Offset + 2] = Pos & 0xFF;
|
||
}
|
||
|
||
// Add some padding so that the deserialization code
|
||
// doesn't try to read past the enf of the array.
|
||
Bytes.push_back(0);
|
||
Bytes.push_back(0);
|
||
Bytes.push_back(0);
|
||
Bytes.push_back(0);
|
||
Bytes.push_back(0);
|
||
Bytes.push_back(0);
|
||
|
||
return Bytes;
|
||
}
|
||
|
||
private:
|
||
void collectKeys(Node *N, std::set<std::string> &Keys) {
|
||
Keys.insert(N->Name);
|
||
for (const std::unique_ptr<Node> &Child : N->Children) {
|
||
collectKeys(Child.get(), Keys);
|
||
}
|
||
}
|
||
|
||
// Merge sequences of 1-character nodes
|
||
// This greatly reduce the total number of nodes,
|
||
// and therefore the size of the index.
|
||
// When the tree gets serialized, we only have 5 bytes to store the
|
||
// size of a name. Overlong names (>32 characters) are therefore
|
||
// kep into separate nodes
|
||
void compact(Node *N) {
|
||
for (auto &&Child : N->Children) {
|
||
compact(Child.get());
|
||
}
|
||
if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
|
||
(N->Parent->Name.size() + N->Name.size() <= 32)) {
|
||
N->Parent->Value = N->Value;
|
||
N->Parent->Name += N->Name;
|
||
N->Parent->Children = std::move(N->Children);
|
||
for (std::unique_ptr<Node> &c : N->Parent->Children) {
|
||
c->Parent = N->Parent;
|
||
}
|
||
}
|
||
}
|
||
struct Node {
|
||
Node(std::string Name, Node *Parent = nullptr)
|
||
: Name(Name), Parent(Parent) {}
|
||
|
||
std::vector<std::unique_ptr<Node>> Children;
|
||
std::string Name;
|
||
Node *Parent = nullptr;
|
||
llvm::Optional<char32_t> Value;
|
||
};
|
||
|
||
std::unique_ptr<Node> Root = std::make_unique<Node>("");
|
||
};
|
||
|
||
extern const char *UnicodeLicense;
|
||
|
||
int main(int argc, char **argv) {
|
||
printf("Unicode name -> codepoint mapping generator\n"
|
||
"Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
|
||
argv[0]);
|
||
printf("NameAliases.txt can be found at "
|
||
"https://unicode.org/Public/15.0.0/ucd/NameAliases.txt\n"
|
||
"UnicodeData.txt can be found at "
|
||
"https://unicode.org/Public/15.0.0/ucd/UnicodeData.txt\n\n");
|
||
|
||
if (argc != 4)
|
||
return EXIT_FAILURE;
|
||
|
||
FILE *Out = fopen(argv[3], "w");
|
||
if (!Out) {
|
||
printf("Error creating output file.\n");
|
||
return EXIT_FAILURE;
|
||
}
|
||
|
||
Trie T;
|
||
uint32_t NameCount = 0;
|
||
std::size_t LongestName = 0;
|
||
auto Entries = loadDataFiles(argv[1], argv[2]);
|
||
for (const std::pair<const char32_t, std::string> &Entry : Entries) {
|
||
char32_t Codepoint = Entry.first;
|
||
const std::string &Name = Entry.second;
|
||
// Ignore names which are not valid.
|
||
if (Name.empty() || !llvm::all_of(Name, [](char C) {
|
||
return llvm::is_contained(Letters, C);
|
||
})) {
|
||
continue;
|
||
}
|
||
printf("%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
|
||
T.insert(Name, Codepoint);
|
||
LongestName =
|
||
std::max(LongestName, std::size_t(llvm::count_if(Name, llvm::isAlnum)));
|
||
NameCount++;
|
||
}
|
||
T.compact();
|
||
|
||
std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
|
||
const std::string &Dict = Data.first;
|
||
const std::vector<uint8_t> &Tree = Data.second;
|
||
|
||
fprintf(Out, R"(
|
||
//===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
|
||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
// See https://llvm.org/LICENSE.txt for license information.
|
||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
//
|
||
// This file implements mapping the name of a unicode code point to its value.
|
||
//
|
||
// This file was generated using %s.
|
||
// Do not edit manually.
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
%s
|
||
|
||
|
||
|
||
#include "llvm/Support/Compiler.h"
|
||
#include <cstddef>
|
||
#include <cstdint>
|
||
)",
|
||
argv[0], UnicodeLicense);
|
||
|
||
fprintf(Out,
|
||
"namespace llvm { namespace sys { namespace unicode { \n"
|
||
"extern const char *UnicodeNameToCodepointDict;\n"
|
||
"extern const uint8_t *UnicodeNameToCodepointIndex;\n"
|
||
"extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
|
||
"extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
|
||
|
||
fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
|
||
Dict.c_str());
|
||
|
||
fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
|
||
Tree.size() + 1);
|
||
|
||
for (auto Byte : Tree) {
|
||
fprintf(Out, "0x%02x,", Byte);
|
||
}
|
||
|
||
fprintf(Out, "0};");
|
||
fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = "
|
||
"UnicodeNameToCodepointIndex_; \n");
|
||
fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
|
||
Tree.size() + 1);
|
||
fprintf(Out,
|
||
"const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
|
||
LongestName);
|
||
fprintf(Out, "\n}}}\n");
|
||
fclose(Out);
|
||
printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
|
||
argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
|
||
}
|
||
|
||
const char *UnicodeLicense = R"(
|
||
/*
|
||
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||
|
||
See Terms of Use <https://www.unicode.org/copyright.html>
|
||
for definitions of Unicode Inc.’s Data Files and Software.
|
||
|
||
NOTICE TO USER: Carefully read the following legal agreement.
|
||
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
||
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
|
||
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
||
TERMS AND CONDITIONS OF THIS AGREEMENT.
|
||
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
|
||
THE DATA FILES OR SOFTWARE.
|
||
|
||
COPYRIGHT AND PERMISSION NOTICE
|
||
|
||
Copyright © 1991-2022 Unicode, Inc. All rights reserved.
|
||
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
|
||
|
||
Permission is hereby granted, free of charge, to any person obtaining
|
||
a copy of the Unicode data files and any associated documentation
|
||
(the "Data Files") or Unicode software and any associated documentation
|
||
(the "Software") to deal in the Data Files or Software
|
||
without restriction, including without limitation the rights to use,
|
||
copy, modify, merge, publish, distribute, and/or sell copies of
|
||
the Data Files or Software, and to permit persons to whom the Data Files
|
||
or Software are furnished to do so, provided that either
|
||
(a) this copyright and permission notice appear with all copies
|
||
of the Data Files or Software, or
|
||
(b) this copyright and permission notice appear in associated
|
||
Documentation.
|
||
|
||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
||
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
||
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
||
|
||
Except as contained in this notice, the name of a copyright holder
|
||
shall not be used in advertising or otherwise to promote the sale,
|
||
use or other dealings in these Data Files or Software without prior
|
||
written authorization of the copyright holder.
|
||
*/
|
||
)";
|