253 lines
		
	
	
		
			8.7 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
			
		
		
	
	
			253 lines
		
	
	
		
			8.7 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
| #!/usr/bin/env python
 | |
| # ===----------------------------------------------------------------------===##
 | |
| #
 | |
| # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 | |
| # See https://llvm.org/LICENSE.txt for license information.
 | |
| # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 | |
| #
 | |
| # ===----------------------------------------------------------------------===##
 | |
| 
 | |
| # The code is based on
 | |
| # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
 | |
| #
 | |
| # Copyright (c) Microsoft Corporation.
 | |
| # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 | |
| 
 | |
| from pathlib import Path
 | |
| from dataclasses import dataclass, field
 | |
| from typing import Optional, TextIO
 | |
| from array import array
 | |
| import sys
 | |
| 
 | |
| 
 | |
@dataclass
class BreakTestItem:
    """Everything extracted from a single line of the break test data.

    All list fields default to a fresh empty list per instance, and
    ``encoded`` defaults to the empty string.
    """

    # First code point of every cluster closed so far on the line.
    code_points: list[int] = field(default_factory=list)
    # The line's code points as concatenated 8-hex-digit escape sequences,
    # ready to be placed inside a C++ string literal.
    encoded: str = ""
    # Running code-unit totals recorded each time a cluster is closed, one
    # list per encoding.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)
 | |
| 
 | |
| 
 | |
class CommentLine:
    """Empty marker type; nothing in the visible part of this file uses it."""
 | |
| 
 | |
| 
 | |
class EOF:
    """Empty marker type; nothing in the visible part of this file uses it."""
 | |
| 
 | |
| 
 | |
def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse one line of break test data from a text stream.

    The stream is consumed one character at a time. A line is a sequence of
    hexadecimal code points separated by a division sign (a break is
    allowed here) or a multiplication sign (no break here), optionally
    followed by a ``#`` comment.

    For every code point its 8-digit escape is appended to ``encoded`` and
    the running UTF-8, UTF-16 and UTF-32 code-unit counts are advanced.
    Each division sign that follows at least one code point closes the
    current cluster: the cluster's first code point and the three running
    counts are recorded.

    Args:
        input: Text stream positioned at the start of a line.

    Returns:
        The parsed item once a newline or a ``#`` comment is reached; the
        item is empty for blank and comment-only lines. ``None`` when the
        end of the stream is reached; anything parsed from an unterminated
        final line is discarded in that case.

    Raises:
        ValueError: If a break marker or a code point is not followed by
            whitespace, if a code point is not valid hexadecimal, or if an
            unexpected character is encountered.
    """

    def expect_whitespace(after: str) -> None:
        # Read first, validate second. Performing the read inside an
        # ``assert`` would remove the read itself when Python runs with -O.
        following = input.read(1)
        if not following.isspace():
            raise ValueError(
                f"expected whitespace after {after!r}, got {following!r}"
            )

    result = BreakTestItem()
    # First code point of the cluster currently being read; -1 when no code
    # point has been seen since the last break.
    code_point = -1
    # Code units consumed so far on this line, per encoding.
    utf8 = 0
    utf16 = 0
    utf32 = 0

    while True:
        c = input.read(1)
        if c == "\N{DIVISION SIGN}":
            # The line starts with a division sign; that one closes no
            # cluster, so nothing is recorded for it.
            if code_point != -1:
                result.code_points.append(code_point)
                code_point = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)

            expect_whitespace(c)
            continue
        if c == "\N{MULTIPLICATION SIGN}":
            expect_whitespace(c)
            continue
        if c.isalnum():
            # Collect the remaining digits of the code point. Reaching the
            # end of the stream simply ends the token.
            while following := input.read(1):
                if following.isalnum():
                    c += following
                elif following.isspace():
                    break
                else:
                    raise ValueError(
                        f"expected whitespace after code point {c!r}, "
                        f"got {following!r}"
                    )
            i = int(c, base=16)
            if code_point == -1:
                code_point = i

            result.encoded += f"\\U{i:08x}"
            character = chr(i)
            utf8 += len(character.encode())
            # Since we only care about the number of code units the byte
            # order doesn't matter. The byte order is specified to avoid
            # the BOM.
            utf16 += len(character.encode("utf-16-le")) // 2
            utf32 += len(character.encode("utf-32-le")) // 4
            continue
        if c == "#":
            # Skip the rest of the line; the comment carries no test data.
            input.readline()
            return result
        if c == "\n":
            return result
        if c == "":
            return None
        raise ValueError(f"unexpected character {c!r} in break test data")
 | |
| 
 | |
| 
 | |
# Template of the generated C++ header, rendered with str.format().
#   {0}  number of test entries (size of each std::array)
#   {1}  initializer rows of the UTF-8 table
#   {2}  initializer rows of the UTF-16 table
#   {3}  initializer rows of the UTF-32 table
# Braces that must appear literally in the C++ output are doubled.
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point of all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

# One initializer row of a data table: {input literal, {code points}, {breaks}}.
cpp_test_data_line_template = "     {{{}, {{{}}}, {{{}}}}}"
 | |
| 
 | |
| 
 | |
def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Format one test item as an initializer row of the UTF-8 table.

    The row consists of a narrow string literal holding the escaped input,
    the recorded code points, and the UTF-8 break offsets.
    """
    literal = '"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf8))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
 | |
| 
 | |
| 
 | |
def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Format one test item as an initializer row of the UTF-16 table.

    The row consists of a wide (``L``-prefixed) string literal holding the
    escaped input, the recorded code points, and the UTF-16 break offsets.
    """
    literal = 'L"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf16))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
 | |
| 
 | |
| 
 | |
def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Format one test item as an initializer row of the UTF-32 table.

    The row consists of a wide (``L``-prefixed) string literal holding the
    escaped input, the recorded code points, and the UTF-32 break offsets.
    """
    literal = 'L"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf32))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
 | |
| 
 | |
| 
 | |
"""
Generate test data from "GraphemeBreakTest.txt"
This file can be downloaded from:
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
This script looks for GraphemeBreakTest.txt in the data/unicode directory
located next to this script
"""
 | |
| 
 | |
| 
 | |
def generate_all() -> str:
    """Render the complete C++ test header from the break test data file.

    The data is read from ``data/unicode/GraphemeBreakTest.txt`` below the
    directory that contains this script. Lines that yield no encoded code
    points (blank and comment-only lines) are skipped.

    Returns:
        The text of the generated header.
    """
    script_dir = Path(__file__).absolute().parent
    test_data_path = script_dir / "data" / "unicode" / "GraphemeBreakTest.txt"

    items = []
    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        while True:
            item = parseBreakTestLine(file)
            if not item:
                # The parser signals the end of the data with None.
                break
            if item.encoded:
                items.append(item)

    entry_count = len(items)
    utf8_rows = ",\n".join(lineToCppDataLineUtf8(item) for item in items)
    utf16_rows = ",\n".join(lineToCppDataLineUtf16(item) for item in items)
    utf32_rows = ",\n".join(lineToCppDataLineUtf32(item) for item in items)
    return cpp_template.format(entry_count, utf8_rows, utf16_rows, utf32_rows)
 | |
| 
 | |
| 
 | |
if __name__ == "__main__":
    # Generate before touching the output so that a failure while reading
    # the test data cannot leave a truncated output file behind.
    header = generate_all()
    if len(sys.argv) == 2:
        # A single argument names the output file. The context manager
        # closes (and thereby flushes) it, and sys.stdout stays untouched.
        with open(sys.argv[1], "w") as output:
            print(header, file=output)
    else:
        # Any other argument count writes the header to standard output.
        print(header)
 |