diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/utfcpp/v3/.circleci/config.yml | 13 | ||||
-rw-r--r-- | lib/utfcpp/v3/CMakeLists.txt | 62 | ||||
-rw-r--r-- | lib/utfcpp/v3/samples/docsample.cpp | 64 | ||||
-rw-r--r-- | lib/utfcpp/v3/utf8cppConfig.cmake.in | 6 | ||||
-rw-r--r-- | lib/utfcpp/v4/.github/workflows/cmake-multi-platform.yml | 75 | ||||
-rw-r--r-- | lib/utfcpp/v4/.gitignore (renamed from lib/utfcpp/v3/.gitignore) | 0 | ||||
-rw-r--r-- | lib/utfcpp/v4/.gitmodules (renamed from lib/utfcpp/v3/.gitmodules) | 0 | ||||
-rw-r--r-- | lib/utfcpp/v4/CMakeLists.txt | 52 | ||||
-rw-r--r-- | lib/utfcpp/v4/LICENSE (renamed from lib/utfcpp/v3/LICENSE) | 0 | ||||
-rw-r--r-- | lib/utfcpp/v4/README.md (renamed from lib/utfcpp/v3/README.md) | 1036 | ||||
-rw-r--r-- | lib/utfcpp/v4/source/utf8.h (renamed from lib/utfcpp/v3/source/utf8.h) | 12 | ||||
-rw-r--r-- | lib/utfcpp/v4/source/utf8/checked.h (renamed from lib/utfcpp/v3/source/utf8/checked.h) | 104 | ||||
-rw-r--r-- | lib/utfcpp/v4/source/utf8/core.h (renamed from lib/utfcpp/v3/source/utf8/core.h) | 205 | ||||
-rw-r--r-- | lib/utfcpp/v4/source/utf8/cpp11.h (renamed from lib/utfcpp/v3/source/utf8/cpp11.h) | 37 | ||||
-rw-r--r-- | lib/utfcpp/v4/source/utf8/cpp17.h (renamed from lib/utfcpp/v3/source/utf8/cpp17.h) | 9 | ||||
-rw-r--r-- | lib/utfcpp/v4/source/utf8/cpp20.h | 124 | ||||
-rw-r--r-- | lib/utfcpp/v4/source/utf8/unchecked.h (renamed from lib/utfcpp/v3/source/utf8/unchecked.h) | 88 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/CMakeLists.txt (renamed from lib/utfcpp/v3/tests/CMakeLists.txt) | 35 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/apitests.cpp (renamed from lib/utfcpp/v3/tests/apitests.cpp) | 0 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/docker/Dockerfile (renamed from lib/utfcpp/v3/tests/docker/Dockerfile) | 2 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/negative.cpp (renamed from lib/utfcpp/v3/tests/negative.cpp) | 0 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/noexceptionstests.cpp (renamed from lib/utfcpp/v3/tests/noexceptionstests.cpp) | 0 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/test_checked_api.h (renamed from lib/utfcpp/v3/tests/test_checked_api.h) | 37 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/test_checked_iterator.h (renamed from lib/utfcpp/v3/tests/test_checked_iterator.h) | 0 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/test_cpp11.cpp (renamed from lib/utfcpp/v3/tests/test_cpp11.cpp) | 13 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/test_cpp17.cpp (renamed from lib/utfcpp/v3/tests/test_cpp17.cpp) | 4 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/test_cpp20.cpp | 77 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/test_data/utf8_invalid.txt (renamed from lib/utfcpp/v3/tests/test_data/utf8_invalid.txt) | bin | 20010 -> 20010 bytes | |||
-rw-r--r-- | lib/utfcpp/v4/tests/test_unchecked_api.h (renamed from lib/utfcpp/v3/tests/test_unchecked_api.h) | 34 | ||||
-rw-r--r-- | lib/utfcpp/v4/tests/test_unchecked_iterator.h (renamed from lib/utfcpp/v3/tests/test_unchecked_iterator.h) | 0 | ||||
-rw-r--r-- | lib/utfcpp/v4/utf8cppConfig.cmake.in | 8 |
31 files changed, 1579 insertions, 518 deletions
diff --git a/lib/utfcpp/v3/.circleci/config.yml b/lib/utfcpp/v3/.circleci/config.yml deleted file mode 100644 index 2588646d..00000000 --- a/lib/utfcpp/v3/.circleci/config.yml +++ /dev/null @@ -1,13 +0,0 @@ -version: 2 - -jobs: - build: - docker: - - image: nemtrif/utf8cpp:3.1.3 - steps: - - checkout - - run: git submodule update --init --recursive --remote - - run: mkdir build - - run: cd build && cmake .. - - run: cd build && cmake --build . - - run: cd build && ctest -VV diff --git a/lib/utfcpp/v3/CMakeLists.txt b/lib/utfcpp/v3/CMakeLists.txt deleted file mode 100644 index c8d4b7a3..00000000 --- a/lib/utfcpp/v3/CMakeLists.txt +++ /dev/null @@ -1,62 +0,0 @@ -cmake_minimum_required (VERSION 3.0.2) -project (utf8cpp VERSION 3.2.2 LANGUAGES CXX) - -if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - set(IS_ROOT_PROJECT ON) -else() - set(IS_ROOT_PROJECT OFF) -endif() - -option(UTF8_TESTS "Enable tests for UTF8-CPP" ${IS_ROOT_PROJECT}) -option(UTF8_INSTALL "Enable installation for UTF8-CPP" ${IS_ROOT_PROJECT}) -option(UTF8_SAMPLES "Enable building samples for UTF8-CPP" ${IS_ROOT_PROJECT}) - -add_library(utf8cpp INTERFACE) -target_include_directories(utf8cpp INTERFACE - "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/source>" - $<INSTALL_INTERFACE:include/utf8cpp> -) -add_library(utf8::cpp ALIAS utf8cpp) - -if(UTF8_INSTALL) - include(CMakePackageConfigHelpers) - if(MSVC) - set(DEF_INSTALL_CMAKE_DIR CMake) - else() - include(GNUInstallDirs) # define CMAKE_INSTALL_* - set(DEF_INSTALL_CMAKE_DIR ${CMAKE_INSTALL_LIBDIR}/cmake/utf8cpp) - endif() - - write_basic_package_version_file( - ${CMAKE_CURRENT_BINARY_DIR}/utf8cppConfigVersion.cmake - VERSION ${PROJECT_VERSION} - COMPATIBILITY SameMajorVersion - ) - - configure_package_config_file( - ${PROJECT_SOURCE_DIR}/utf8cppConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/utf8cppConfig.cmake - INSTALL_DESTINATION ${DEF_INSTALL_CMAKE_DIR} - ) - - install(DIRECTORY source/ DESTINATION include/utf8cpp) - install(TARGETS utf8cpp 
EXPORT utf8cppTargets) - install(EXPORT utf8cppTargets DESTINATION ${DEF_INSTALL_CMAKE_DIR}) - install( - FILES - ${CMAKE_CURRENT_BINARY_DIR}/utf8cppConfig.cmake - ${CMAKE_CURRENT_BINARY_DIR}/utf8cppConfigVersion.cmake - DESTINATION - ${DEF_INSTALL_CMAKE_DIR} - ) -endif() - -if(UTF8_SAMPLES) - add_executable(docsample ${PROJECT_SOURCE_DIR}/samples/docsample.cpp) - target_link_libraries(docsample PRIVATE utf8::cpp) -endif() - -if(UTF8_TESTS) - enable_testing() - add_subdirectory(tests) -endif() diff --git a/lib/utfcpp/v3/samples/docsample.cpp b/lib/utfcpp/v3/samples/docsample.cpp deleted file mode 100644 index 65338872..00000000 --- a/lib/utfcpp/v3/samples/docsample.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include "../source/utf8.h" -#include <iostream> -#include <fstream> -#include <string> -#include <vector> - - -using namespace std; - -int main(int argc, char** argv) -{ - if (argc != 2) { - cout << "\nUsage: docsample filename\n"; - return 0; - } - const char* test_file_path = argv[1]; - // Open the test file (must be UTF-8 encoded) - ifstream fs8(test_file_path); - if (!fs8.is_open()) { - cout << "Could not open " << test_file_path << endl; - return 0; - } - - unsigned line_count = 1; - string line; - // Play with all the lines in the file - while (getline(fs8, line)) { - // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) -#if __cplusplus >= 201103L // C++ 11 or later - auto end_it = utf8::find_invalid(line.begin(), line.end()); -#else - string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); -#endif // C++ 11 - if (end_it != line.end()) { - cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; - cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; - } - // Get the line length (at least for the valid part) - ptrdiff_t length = utf8::distance(line.begin(), end_it); - cout << "Length of line " << line_count << " is " << length << "\n"; - - // Convert it to utf-16 -#if 
__cplusplus >= 201103L // C++ 11 or later - u16string utf16line = utf8::utf8to16(line); -#else - vector<unsigned short> utf16line; - utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); -#endif // C++ 11 - // And back to utf-8; -#if __cplusplus >= 201103L // C++ 11 or later - string utf8line = utf8::utf16to8(utf16line); -#else - string utf8line; - utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line)); -#endif // C++ 11 - // Confirm that the conversion went OK: - if (utf8line != string(line.begin(), end_it)) - cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; - - line_count++; - } - - return 0; -} diff --git a/lib/utfcpp/v3/utf8cppConfig.cmake.in b/lib/utfcpp/v3/utf8cppConfig.cmake.in deleted file mode 100644 index 450fe8d1..00000000 --- a/lib/utfcpp/v3/utf8cppConfig.cmake.in +++ /dev/null @@ -1,6 +0,0 @@ -@PACKAGE_INIT@ - -include("${CMAKE_CURRENT_LIST_DIR}/utf8cppTargets.cmake") -check_required_components( "utf8cpp" ) - -add_library(utf8::cpp ALIAS utf8cpp) diff --git a/lib/utfcpp/v4/.github/workflows/cmake-multi-platform.yml b/lib/utfcpp/v4/.github/workflows/cmake-multi-platform.yml new file mode 100644 index 00000000..b744790e --- /dev/null +++ b/lib/utfcpp/v4/.github/workflows/cmake-multi-platform.yml @@ -0,0 +1,75 @@ +# This starter workflow is for a CMake project running on multiple platforms. There is a different starter workflow if you just want a single platform. +# See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-single-platform.yml +name: CMake on multiple platforms + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + + strategy: + # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable. + fail-fast: false + + # Set up a matrix to run the following 3 configurations: + # 1. 
<Windows, Release, latest MSVC compiler toolchain on the default runner image, default generator> + # 2. <Linux, Release, latest GCC compiler toolchain on the default runner image, default generator> + # 3. <Linux, Release, latest Clang compiler toolchain on the default runner image, default generator> + # + # To add more build types (Release, Debug, RelWithDebInfo, etc.) customize the build_type list. + matrix: + os: [ubuntu-latest, windows-latest] + build_type: [Release] + c_compiler: [gcc, clang, cl] + include: + - os: windows-latest + c_compiler: cl + cpp_compiler: cl + - os: ubuntu-latest + c_compiler: gcc + cpp_compiler: g++ + - os: ubuntu-latest + c_compiler: clang + cpp_compiler: clang++ + exclude: + - os: windows-latest + c_compiler: gcc + - os: windows-latest + c_compiler: clang + - os: ubuntu-latest + c_compiler: cl + + steps: + - uses: actions/checkout@v3 + + - name: Set reusable strings + # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file. + id: strings + shell: bash + run: | + echo "build-output-dir=${{ github.workspace }}/tests/build" >> "$GITHUB_OUTPUT" + + - name: Configure CMake + # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. + # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} + -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -S ${{ github.workspace }} + + - name: Build + # Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). 
+ run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Test + working-directory: ${{ steps.strings.outputs.build-output-dir }} + # Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). + # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail + run: ctest -VV --build-config ${{ matrix.build_type }} diff --git a/lib/utfcpp/v3/.gitignore b/lib/utfcpp/v4/.gitignore index 488d51dd..488d51dd 100644 --- a/lib/utfcpp/v3/.gitignore +++ b/lib/utfcpp/v4/.gitignore diff --git a/lib/utfcpp/v3/.gitmodules b/lib/utfcpp/v4/.gitmodules index 424f86b7..424f86b7 100644 --- a/lib/utfcpp/v3/.gitmodules +++ b/lib/utfcpp/v4/.gitmodules diff --git a/lib/utfcpp/v4/CMakeLists.txt b/lib/utfcpp/v4/CMakeLists.txt new file mode 100644 index 00000000..842c10ff --- /dev/null +++ b/lib/utfcpp/v4/CMakeLists.txt @@ -0,0 +1,52 @@ +# This file is deprecated and will be removed in a future release +# Please see the instructions for installation in README.md file + +cmake_minimum_required (VERSION 3.5...3.27) +project (utf8cpp + VERSION 4.0.4 + LANGUAGES CXX + DESCRIPTION "C++ portable library for working with utf-8 encoding") + +add_library(${PROJECT_NAME} INTERFACE) + +include(GNUInstallDirs) + +target_include_directories(utf8cpp INTERFACE + "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/source>" + $<INSTALL_INTERFACE:include/utf8cpp> +) + +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + "${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + VERSION ${PROJECT_VERSION} + COMPATIBILITY SameMajorVersion +) + +install(TARGETS ${PROJECT_NAME} + EXPORT ${PROJECT_NAME}Targets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + PUBLIC_HEADER DESTINATION include COMPONENT Development + BUNDLE 
DESTINATION bin COMPONENT Runtime +) + +configure_package_config_file( + "${PROJECT_SOURCE_DIR}/${PROJECT_NAME}Config.cmake.in" + "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/cmake +) + +install(EXPORT ${PROJECT_NAME}Targets + FILE ${PROJECT_NAME}Targets.cmake + NAMESPACE ${PROJECT_NAME}:: + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/cmake) + +install(FILES "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + "${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/cmake) + +install(FILES ${PROJECT_SOURCE_DIR}/source/utf8.h DESTINATION include/utf8cpp) +install(DIRECTORY ${PROJECT_SOURCE_DIR}/source/utf8 DESTINATION + include/utf8cpp) diff --git a/lib/utfcpp/v3/LICENSE b/lib/utfcpp/v4/LICENSE index 36b7cd93..36b7cd93 100644 --- a/lib/utfcpp/v3/LICENSE +++ b/lib/utfcpp/v4/LICENSE diff --git a/lib/utfcpp/v3/README.md b/lib/utfcpp/v4/README.md index a519cdb9..624e8481 100644 --- a/lib/utfcpp/v3/README.md +++ b/lib/utfcpp/v4/README.md @@ -1,17 +1,130 @@ + +<!-- TOC --><a name="utf8-cpp-utf-8-with-c-in-a-portable-way"></a> # UTF8-CPP: UTF-8 with C++ in a Portable Way +<!-- TOC --><a name="introduction"></a> ## Introduction -C++ developers miss an easy and portable way of handling Unicode encoded strings. The original C++ Standard (known as C++98 or C++03) is Unicode agnostic. C++11 provides some support for Unicode on core language and library level: u8, u, and U character and string literals, char16_t and char32_t character types, u16string and u32string library classes, and codecvt support for conversions between Unicode encoding forms. In the meantime, developers use third party libraries like ICU, OS specific capabilities, or simply roll out their own solutions. - -In order to easily handle UTF-8 encoded Unicode strings, I came up with a small, C++98 compatible generic library. 
For anybody used to work with STL algorithms and iterators, it should be easy and natural to use. The code is freely available for any purpose - check out the [license](./LICENSE). The library has been used a lot in the past ten years both in commercial and open-source projects and is considered feature-complete now. If you run into bugs or performance issues, please let me know and I'll do my best to address them. - -The purpose of this article is not to offer an introduction to Unicode in general, and UTF-8 in particular. If you are not familiar with Unicode, be sure to check out [Unicode Home Page](http://www.unicode.org/) or some other source of information for Unicode. Also, it is not my aim to advocate the use of UTF-8 encoded strings in C++ programs; if you want to handle UTF-8 encoded strings from C++, I am sure you have good reasons for it. - +C++ developers still miss an easy and portable way of handling Unicode encoded strings. The original C++ standard (known as C++98 or C++03) is Unicode agnostic. Some progress has been made in the later editions of the standard, but it is still hard to work with Unicode using only the standard facilities. + +I came up with a small, C++98 compatible generic library in order to handle UTF-8 encoded strings. For anybody used to work with STL algorithms and iterators, it should be easy and natural to use. The code is freely available for any purpose - check out the [license](./LICENSE). The library has been used a lot since the first release in 2006 both in commercial and open-source projects and proved to be stable and useful. 
+ +## Table of Contents + +- [UTF8-CPP: UTF-8 with C++ in a Portable Way](#utf8-cpp-utf-8-with-c-in-a-portable-way) + * [Introduction](#introduction) + * [Installation](#installation) + * [Examples of use](#examples-of-use) + + [Introductory Sample](#introductory-sample) + + [Checking if a file contains valid UTF-8 text](#checking-if-a-file-contains-valid-utf-8-text) + + [Ensure that a string contains valid UTF-8 text](#ensure-that-a-string-contains-valid-utf-8-text) + * [Points of interest](#points-of-interest) + - [Design goals and decisions](#design-goals-and-decisions) + - [Alternatives](#alternatives) + * [Reference](#reference) + + [Functions From utf8 Namespace](#functions-from-utf8-namespace) + - [utf8::append](#utf8append) + * [octet_iterator append(utfchar32_t cp, octet_iterator result)](#octet_iterator-appendutfchar32_t-cp-octet_iterator-result) + * [void append(utfchar32_t cp, std::string& s);](#void-appendutfchar32_t-cp-stdstring-s) + - [utf8::append16](#utf8append16) + * [word_iterator append16(utfchar32_t cp, word_iterator result)](#word_iterator-append16utfchar32_t-cp-word_iterator-result) + * [void append(utfchar32_t cp, std::u16string& s)](#void-appendutfchar32_t-cp-stdu16string-s) + - [utf8::next](#utf8next) + - [utf8::next16](#utf8next16) + - [utf8::peek_next](#utf8peek_next) + - [utf8::prior](#utf8prior) + - [utf8::advance](#utf8advance) + - [utf8::distance](#utf8distance) + - [utf8::utf16to8](#utf8utf16to8) + * [octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)](#octet_iterator-utf16to8-u16bit_iterator-start-u16bit_iterator-end-octet_iterator-result) + * [std::string utf16to8(const std::u16string& s)](#stdstring-utf16to8const-stdu16string-s) + * [std::string utf16to8(std::u16string_view s)](#stdstring-utf16to8stdu16string_view-s) + - [utf8::utf16tou8](#utf8utf16tou8) + * [std::u8string utf16tou8(const std::u16string& s)](#stdu8string-utf16tou8const-stdu16string-s) + * [std::u8string utf16tou8(const 
std::u16string_view& s)](#stdu8string-utf16tou8const-stdu16string_view-s) + - [utf8::utf8to16](#utf8utf8to16) + * [u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)](#u16bit_iterator-utf8to16-octet_iterator-start-octet_iterator-end-u16bit_iterator-result) + * [std::u16string utf8to16(const std::string& s)](#stdu16string-utf8to16const-stdstring-s) + * [std::u16string utf8to16(std::string_view s)](#stdu16string-utf8to16stdstring_view-s) + * [std::u16string utf8to16(std::u8string& s)](#stdu16string-utf8to16stdu8string-s) + * [std::u16string utf8to16(std::u8string_view& s)](#stdu16string-utf8to16stdu8string_view-s) + - [utf8::utf32to8](#utf8utf32to8) + * [octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)](#octet_iterator-utf32to8-u32bit_iterator-start-u32bit_iterator-end-octet_iterator-result) + * [std::string utf32to8(const std::u32string& s)](#stdstring-utf32to8const-stdu32string-s) + * [std::u8string utf32to8(const std::u32string& s)](#stdu8string-utf32to8const-stdu32string-s) + * [std::u8string utf32to8(const std::u32string_view& s)](#stdu8string-utf32to8const-stdu32string_view-s) + * [std::string utf32to8(const std::u32string& s)](#stdstring-utf32to8const-stdu32string-s-1) + * [std::string utf32to8(std::u32string_view s)](#stdstring-utf32to8stdu32string_view-s) + - [utf8::utf8to32](#utf8utf8to32) + * [u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)](#u32bit_iterator-utf8to32-octet_iterator-start-octet_iterator-end-u32bit_iterator-result) + * [std::u32string utf8to32(const std::u8string& s)](#stdu32string-utf8to32const-stdu8string-s) + * [std::u32string utf8to32(const std::u8string_view& s)](#stdu32string-utf8to32const-stdu8string_view-s) + * [std::u32string utf8to32(const std::string& s)](#stdu32string-utf8to32const-stdstring-s) + * [std::u32string utf8to32(std::string_view s)](#stdu32string-utf8to32stdstring_view-s) + - 
[utf8::find_invalid](#utf8find_invalid) + * [octet_iterator find_invalid(octet_iterator start, octet_iterator end)](#octet_iterator-find_invalidoctet_iterator-start-octet_iterator-end) + * [const char* find_invalid(const char* str)](#const-char-find_invalidconst-char-str) + * [std::size_t find_invalid(const std::string& s)](#stdsize_t-find_invalidconst-stdstring-s) + * [std::size_t find_invalid(std::string_view s)](#stdsize_t-find_invalidstdstring_view-s) + - [utf8::is_valid](#utf8is_valid) + * [bool is_valid(octet_iterator start, octet_iterator end)](#bool-is_validoctet_iterator-start-octet_iterator-end) + * [bool is_valid(const char* str)](#bool-is_validconst-char-str) + * [bool is_valid(const std::string& s)](#bool-is_validconst-stdstring-s) + * [bool is_valid(std::string_view s)](#bool-is_validstdstring_view-s) + - [utf8::replace_invalid](#utf8replace_invalid) + * [output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)](#output_iterator-replace_invalidoctet_iterator-start-octet_iterator-end-output_iterator-out-utfchar32_t-replacement) + * [std::string replace_invalid(const std::string& s, utfchar32_t replacement)](#stdstring-replace_invalidconst-stdstring-s-utfchar32_t-replacement) + * [std::string replace_invalid(std::string_view s, char32_t replacement)](#stdstring-replace_invalidstdstring_view-s-char32_t-replacement) + - [utf8::starts_with_bom](#utf8starts_with_bom) + * [bool starts_with_bom (octet_iterator it, octet_iterator end)](#bool-starts_with_bom-octet_iterator-it-octet_iterator-end) + * [bool starts_with_bom(const std::string& s)](#bool-starts_with_bomconst-stdstring-s) + * [bool starts_with_bom(std::string_view s)](#bool-starts_with_bomstdstring_view-s) + + [Types From utf8 Namespace](#types-from-utf8-namespace) + - [utf8::exception](#utf8exception) + - [utf8::invalid_code_point](#utf8invalid_code_point) + - [utf8::invalid_utf8](#utf8invalid_utf8) + - 
[utf8::invalid_utf16](#utf8invalid_utf16) + - [utf8::not_enough_room](#utf8not_enough_room) + - [utf8::iterator](#utf8iterator) + * [Member functions](#member-functions) + + [Functions From utf8::unchecked Namespace](#functions-from-utf8unchecked-namespace) + - [utf8::unchecked::append](#utf8uncheckedappend) + - [utf8::unchecked::append16](#utf8uncheckedappend16) + - [utf8::unchecked::next](#utf8uncheckednext) + - [utf8::next16](#utf8next16-1) + - [utf8::unchecked::peek_next](#utf8uncheckedpeek_next) + - [utf8::unchecked::prior](#utf8uncheckedprior) + - [utf8::unchecked::advance](#utf8uncheckedadvance) + - [utf8::unchecked::distance](#utf8uncheckeddistance) + - [utf8::unchecked::utf16to8](#utf8uncheckedutf16to8) + - [utf8::unchecked::utf8to16](#utf8uncheckedutf8to16) + - [utf8::unchecked::utf32to8](#utf8uncheckedutf32to8) + - [utf8::unchecked::utf8to32](#utf8uncheckedutf8to32) + - [utf8::unchecked::replace_invalid](#utf8uncheckedreplace_invalid) + + [Types From utf8::unchecked Namespace](#types-from-utf8unchecked-namespace) + - [utf8::iterator](#utf8iterator-1) + * [Member functions](#member-functions-1) + +<!-- TOC end --> + + +<!-- TOC --><a name="installation"></a> +## Installation + +This is a header-only library and the supported way of deploying it is: +- Download a release from https://github.com/nemtrif/utfcpp/releases into a temporary directory +- Unzip the release +- Copy the content of utfcpp/source directory into the directory where you keep include files for your project + + +The CMakeLists.txt file was originally made for testing purposes only, but unfortunately over time I accepted contributions that added an install target. *This is not a supported way of installing the utfcpp library* and I am considering removing the CMakeLists.txt in a future release. 
+ +<!-- TOC --><a name="examples-of-use"></a> ## Examples of use -### Introductionary Sample +<!-- TOC --><a name="introductory-sample"></a> +### Introductory Sample To illustrate the use of the library, let's start with a small but complete program that opens a file containing UTF-8 encoded text, reads it line by line, checks each line for invalid UTF-8 byte sequences, and converts it to UTF-16 encoding and back to UTF-8: @@ -100,6 +213,7 @@ In case you do not trust the `__cplusplus` macro or, for instance, do not want t the C++ 11 helper functions even with a modern compiler, define `UTF_CPP_CPLUSPLUS` macro before including `utf8.h` and assign it a value for the standard you want to use - the values are the same as for the `__cplusplus` macro. This can be also useful with compilers that are conservative in setting the `__cplusplus` macro even if they have a good support for a recent standard edition - Microsoft's Visual C++ is one example. +<!-- TOC --><a name="checking-if-a-file-contains-valid-utf-8-text"></a> ### Checking if a file contains valid UTF-8 text Here is a function that checks whether the content of a file is valid UTF-8 encoded text without reading the content into the memory: @@ -126,6 +240,7 @@ Note that other functions that take input iterator arguments can be used in a si utf8::utf8to16(it, eos, back_inserter(u16string)); ``` +<!-- TOC --><a name="ensure-that-a-string-contains-valid-utf-8-text"></a> ### Ensure that a string contains valid UTF-8 text If we have some text that "probably" contains UTF-8 encoded text and we want to replace any invalid UTF-8 sequence with a replacement character, something like the following function may be used: @@ -142,39 +257,82 @@ void fix_utf8_string(std::string& str) The function will replace any invalid UTF-8 sequence with a Unicode replacement character. There is an overloaded function that enables the caller to supply their own replacement character. 
+<!-- TOC --><a name="points-of-interest"></a> ## Points of interest +<!-- TOC --><a name="design-goals-and-decisions"></a> #### Design goals and decisions The library was designed to be: 1. Generic: for better or worse, there are many C++ string classes out there, and the library should work with as many of them as possible. -2. Portable: the library should be portable both accross different platforms and compilers. The only non-portable code is a small section that declares unsigned integers of different sizes: three typedefs. They can be changed by the users of the library if they don't match their platform. The default setting should work for Windows (both 32 and 64 bit), and most 32 bit and 64 bit Unix derivatives. Support for post C++03 language features is included for modern compilers at API level only, so the library should work even with pretty old compilers. +2. Portable: the library should be portable both across different platforms and compilers. The only non-portable code is a small section that declares unsigned integers of different sizes: three typedefs. They can be changed by the users of the library if they don't match their platform. The default setting should work for Windows (both 32 and 64 bit), and most 32 bit and 64 bit Unix derivatives. Support for post C++03 language features is included for modern compilers at API level only, so the library should work even with pretty old compilers. 3. Lightweight: follow the "pay only for what you use" guideline. 4. Unintrusive: avoid forcing any particular design or even programming style on the user. This is a library, not a framework. 
+<!-- TOC --><a name="alternatives"></a> #### Alternatives -In case you want to look into other means of working with UTF-8 strings from C++, here is the list of solutions I am aware of: +For alternatives and comparisons, I recommend the following article: [The Wonderfully Terrible World of C and C++ Encoding APIs (with Some Rust)](https://thephd.dev/the-c-c++-rust-string-text-encoding-api-landscape), by JeanHeyd Meneide. In the article, this library is compared with: -1. [ICU Library](http://icu.sourceforge.net/). It is very powerful, complete, feature-rich, mature, and widely used. Also big, intrusive, non-generic, and doesn't play well with the Standard Library. I definitelly recommend looking at ICU even if you don't plan to use it. -2. C++11 language and library features. Still far from complete, and not easy to use. -3. [Glib::ustring](http://www.gtkmm.org/gtkmm2/docs/tutorial/html/ch03s04.html). A class specifically made to work with UTF-8 strings, and also feel like `std::string`. If you prefer to have yet another string class in your code, it may be worth a look. Be aware of the licensing issues, though. -4. Platform dependent solutions: Windows and POSIX have functions to convert strings from one encoding to another. That is only a subset of what my library offers, but if that is all you need it may be good enough. +- [simdutf](https://github.com/simdutf/simdutf) +- [iconv](https://www.gnu.org/software/libiconv/) +- [boost.text](https://github.com/tzlaine/text) +- [ICU](https://unicode-org.github.io/icu/userguide/conversion/converters.html) +- [encoding_rs](https://github.com/hsivonen/encoding_rs) +- [Windows API functions for converting text between encodings](https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/) +- [ztd.text](https://github.com/soasis/text/) +The article presents author's view of the quality of the API design, but also some speed benchmarks. 
+<!-- TOC --><a name="reference"></a> ## Reference +<!-- TOC --><a name="functions-from-utf8-namespace"></a> ### Functions From utf8 Namespace +<!-- TOC --><a name="utf8append"></a> #### utf8::append -Available in version 3.0 and later. Requires a C++ 11 compliant compiler. +<!-- TOC --><a name="octet_iterator-appendutfchar32_t-cp-octet_iterator-result"></a> +##### octet_iterator append(utfchar32_t cp, octet_iterator result) + +Available in version 1.0 and later. + +Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string. + +```cpp +template <typename octet_iterator> +octet_iterator append(utfchar32_t cp, octet_iterator result); +``` + +`octet_iterator`: an output iterator. +`cp`: a 32 bit integer representing a code point to append to the sequence. +`result`: an output iterator to the place in the sequence where to append the code point. +Return value: an iterator pointing to the place after the newly appended sequence. + +Example of use: + +```cpp +unsigned char u[5] = {0,0,0,0,0}; +unsigned char* end = append(0x0448, u); +assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); +``` + +Note that `append` does not allocate any memory - it is the burden of the caller to make sure there is enough memory allocated for the operation. To make things more interesting, `append` can add anywhere between 1 and 4 octets to the sequence. In practice, you would most often want to use `std::back_inserter` to ensure that the necessary memory is allocated. + +In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + + +<!-- TOC --><a name="void-appendutfchar32_t-cp-stdstring-s"></a> +##### void append(utfchar32_t cp, std::string& s); + +Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0. Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string. 
```cpp -void append(char32_t cp, std::string& s); +void append(utfchar32_t cp, std::string& s); ``` `cp`: a code point to append to the string. @@ -190,19 +348,21 @@ assert (u[0] == char(0xd1) && u[1] == char(0x88) && u.length() == 2); In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. +<!-- TOC --><a name="utf8append16"></a> +#### utf8::append16 +<!-- TOC --><a name="word_iterator-append16utfchar32_t-cp-word_iterator-result"></a> +##### word_iterator append16(utfchar32_t cp, word_iterator result) -#### utf8::append +Available in version 4.0 and later. -Available in version 1.0 and later. - -Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string. +Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string. ```cpp -template <typename octet_iterator> -octet_iterator append(uint32_t cp, octet_iterator result); +template <typename word_iterator> +word_iterator append16(utfchar32_t cp, word_iterator result); ``` -`octet_iterator`: an output iterator. +`word_iterator`: an output iterator. `cp`: a 32 bit integer representing a code point to append to the sequence. `result`: an output iterator to the place in the sequence where to append the code point. Return value: an iterator pointing to the place after the newly appended sequence. @@ -210,15 +370,42 @@ Return value: an iterator pointing to the place after the newly appended sequenc Example of use: ```cpp -unsigned char u[5] = {0,0,0,0,0}; -unsigned char* end = append(0x0448, u); -assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); +unsigned short u[2] = {0,0}; +unsigned short* end = append16(0x0448, u); +assert (u[0] == 0x0448 && u[1] == 0); ``` -Note that `append` does not allocate any memory - it is the burden of the caller to make sure there is enough memory allocated for the operation. 
To make things more interesting, `append` can add anywhere between 1 and 4 octets to the sequence. In practice, you would most often want to use `std::back_inserter` to ensure that the necessary memory is allocated. +Note that `append16` does not allocate any memory - it is the burden of the caller to make sure there is enough memory allocated for the operation. To make things more interesting, `append16` can add either one or two words to the sequence. In practice, you would most often want to use `std::back_inserter` to ensure that the necessary memory is allocated. In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + +<!-- TOC --><a name="void-appendutfchar32_t-cp-stdu16string-s"></a> +##### void append(utfchar32_t cp, std::u16string& s) + +Available in version 4.0 and later. Requires a C++11 compliant compiler. + +Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string. + +```cpp +void append(utfchar32_t cp, std::u16string& s); +``` + +`cp`: a code point to append to the string. +`s`: a utf-16 encoded string to append the code point to. + +Example of use: + +```cpp +std::u16string u; +append(0x0448, u); +assert (u[0] == 0x0448 && u.length() == 1); +``` + +In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + + +<!-- TOC --><a name="utf8next"></a> #### utf8::next Available in version 1.0 and later. @@ -227,7 +414,7 @@ Given the iterator to the beginning of the UTF-8 sequence, it returns the code p ```cpp template <typename octet_iterator> -uint32_t next(octet_iterator& it, octet_iterator end); +utfchar32_t next(octet_iterator& it, octet_iterator end); ``` `octet_iterator`: an input iterator. @@ -247,8 +434,41 @@ assert (w == twochars + 3); This function is typically used to iterate through a UTF-8 encoded string. -In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. 
+In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + +<!-- TOC --><a name="utf8next16"></a> +#### utf8::next16 + +Available in version 4.0 and later. + +Given the iterator to the beginning of the UTF-16 sequence, it returns the code point and moves the iterator to the next position. + +```cpp +template <typename word_iterator> +utfchar32_t next16(word_iterator& it, word_iterator end); +``` + +`word_iterator`: an input iterator. +`it`: a reference to an iterator pointing to the beginning of an UTF-16 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point. +`end`: end of the UTF-16 sequence to be processed. If `it` gets equal to `end` during the extraction of a code point, an `utf8::not_enough_room` exception is thrown. +Return value: the 32 bit representation of the processed UTF-16 code point. + +Example of use: + +```cpp +const unsigned short u[3] = {0x65e5, 0xd800, 0xdf46}; +const unsigned short* w = u; +int cp = next16(w, w + 3); +assert (cp == 0x65e5); +assert (w == u + 1); +``` + +This function is typically used to iterate through a UTF-16 encoded string. +In case of an invalid UTF-16 sequence, a `utf8::invalid_utf8` exception is thrown. + + +<!-- TOC --><a name="utf8peek_next"></a> #### utf8::peek_next Available in version 2.1 and later. @@ -257,7 +477,7 @@ Given the iterator to the beginning of the UTF-8 sequence, it returns the code p ```cpp template <typename octet_iterator> -uint32_t peek_next(octet_iterator it, octet_iterator end); +utfchar32_t peek_next(octet_iterator it, octet_iterator end); ``` @@ -276,8 +496,9 @@ assert (cp == 0x65e5); assert (w == twochars); ``` -In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. +<!-- TOC --><a name="utf8prior"></a> #### utf8::prior Available in version 1.02 and later. 
@@ -286,7 +507,7 @@ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it de ```cpp template <typename octet_iterator> -uint32_t prior(octet_iterator& it, octet_iterator start); +utfchar32_t prior(octet_iterator& it, octet_iterator start); ``` `octet_iterator`: a bidirectional iterator. @@ -312,6 +533,7 @@ In case `start` is reached before a UTF-8 lead octet is hit, or if an invalid UT In case `start` equals `it`, a `not_enough_room` exception is thrown. +<!-- TOC --><a name="utf8advance"></a> #### utf8::advance Available in version 1.0 and later. @@ -341,11 +563,12 @@ assert (w == twochars); In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. +<!-- TOC --><a name="utf8distance"></a> #### utf8::distance Available in version 1.0 and later. -Given the iterators to two UTF-8 encoded code points in a seqence, returns the number of code points between them. +Given the iterators to two UTF-8 encoded code points in a sequence, returns the number of code points between them. ```cpp template <typename octet_iterator> @@ -367,9 +590,43 @@ assert (dist == 2); This function is used to find the length (in code points) of a UTF-8 encoded string. The reason it is called _distance_, rather than, say, _length_ is mainly because developers are used that _length_ is an O(1) function. Computing the length of an UTF-8 string is a linear operation, and it looked better to model it after `std::distance` algorithm. -In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. If `last` does not point to the past-of-end of a UTF-8 seqence, a `utf8::not_enough_room` exception is thrown. +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. If `last` does not point to the past-of-end of a UTF-8 sequence, a `utf8::not_enough_room` exception is thrown. 
+<!-- TOC --><a name="utf8utf16to8"></a> #### utf8::utf16to8 +<!-- TOC --><a name="octet_iterator-utf16to8-u16bit_iterator-start-u16bit_iterator-end-octet_iterator-result"></a> +##### octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + +Available in version 1.0 and later. + +Converts a UTF-16 encoded string to UTF-8. + +```cpp +template <typename u16bit_iterator, typename octet_iterator> +octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result); +``` + +`u16bit_iterator`: an input iterator. +`octet_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-16 encoded string to convert. +`end`: an iterator pointing to pass-the-end of the UTF-16 encoded string to convert. +`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-8 string. + +Example of use: + +```cpp +unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; +vector<unsigned char> utf8result; +utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); +assert (utf8result.size() == 10); +``` + +In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. + + +<!-- TOC --><a name="stdstring-utf16to8const-stdu16string-s"></a> +##### std::string utf16to8(const std::u16string& s) Available in version 3.0 and later. Requires a C++ 11 compliant compiler. @@ -392,7 +649,8 @@ Example of use: In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. -#### utf8::utf16to8 +<!-- TOC --><a name="stdstring-utf16to8stdu16string_view-s"></a> +##### std::string utf16to8(std::u16string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -416,37 +674,95 @@ Example of use: In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. 
+<!-- TOC --><a name="utf8utf16tou8"></a> +#### utf8::utf16tou8 +<!-- TOC --><a name="stdu8string-utf16tou8const-stdu16string-s"></a> +##### std::u8string utf16tou8(const std::u16string& s) -#### utf8::utf16to8 +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. -Available in version 1.0 and later. +Converts a UTF-16 encoded string to UTF-8. + +```cpp +std::u8string utf16tou8(const std::u16string& s); +``` + +`s`: a UTF-16 encoded string. +Return value: A UTF-8 encoded string. + +Example of use: + +```cpp + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u8string u = utf16tou8(utf16string); + assert (u.size() == 10); +``` + +In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. + +<!-- TOC --><a name="stdu8string-utf16tou8const-stdu16string_view-s"></a> +##### std::u8string utf16tou8(const std::u16string_view& s) + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. Converts a UTF-16 encoded string to UTF-8. ```cpp -template <typename u16bit_iterator, typename octet_iterator> -octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result); +std::u8string utf16tou8(const std::u16string_view& s); ``` -`u16bit_iterator`: an input iterator. -`octet_iterator`: an output iterator. -`start`: an iterator pointing to the beginning of the UTF-16 encoded string to convert. -`end`: an iterator pointing to pass-the-end of the UTF-16 encoded string to convert. -`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. -Return value: An iterator pointing to the place after the appended UTF-8 string. +`s`: a UTF-16 encoded string. +Return value: A UTF-8 encoded string. 
Example of use: ```cpp -unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; -vector<unsigned char> utf8result; -utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); -assert (utf8result.size() == 10); + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u16string_view utf16stringview(utf16string); + u8string u = utf16tou8(utf16stringview); + assert (u.size() == 10); ``` In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. +<!-- TOC --><a name="utf8utf8to16"></a> #### utf8::utf8to16 +<!-- TOC --><a name="u16bit_iterator-utf8to16-octet_iterator-start-octet_iterator-end-u16bit_iterator-result"></a> +##### u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + +Available in version 1.0 and later. + +Converts an UTF-8 encoded string to UTF-16. + +```cpp +template <typename u16bit_iterator, typename octet_iterator> +u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result); +``` + +`octet_iterator`: an input iterator. +`u16bit_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. +`end`: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert. +`result`: an output iterator to the place in the UTF-16 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-16 string. + +Example of use: + +```cpp +char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +vector <unsigned short> utf16result; +utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); +assert (utf16result.size() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. 
If `end` does not point to the past-of-end of a UTF-8 sequence, a `utf8::not_enough_room` exception is thrown. + + + +<!-- TOC --><a name="stdu16string-utf8to16const-stdstring-s"></a> +##### std::u16string utf8to16(const std::string& s) Available in version 3.0 and later. Requires a C++ 11 compliant compiler. @@ -469,9 +785,11 @@ assert (utf16result[2] == 0xd834); assert (utf16result[3] == 0xdd1e); ``` -In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. -#### utf8::utf8to16 + +<!-- TOC --><a name="stdu16string-utf8to16stdstring_view-s"></a> +##### std::u16string utf8to16(std::string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -494,40 +812,98 @@ assert (utf16result[2] == 0xd834); assert (utf16result[3] == 0xdd1e); ``` +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + + +<!-- TOC --><a name="stdu16string-utf8to16stdu8string-s"></a> +##### std::u16string utf8to16(std::u8string& s) + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts an UTF-8 encoded string to UTF-16. + +```cpp +std::u16string utf8to16(std::u8string& s); +``` + +`s`: an UTF-8 encoded string to convert. +Return value: A UTF-16 encoded string. + +Example of use: + +```cpp +std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +std::u16string utf16result = utf8to16(utf8_with_surrogates); +assert (utf16result.length() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); +``` + In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. -#### utf8::utf8to16 +<!-- TOC --><a name="stdu16string-utf8to16stdu8string_view-s"></a> +##### std::u16string utf8to16(std::u8string_view& s) -Available in version 1.0 and later. +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. 
-Converts an UTF-8 encoded string to UTF-16 +Converts an UTF-8 encoded string to UTF-16. ```cpp -template <typename u16bit_iterator, typename octet_iterator> -u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result); +std::u16string utf8to16(std::u8string_view& s); ``` -`octet_iterator`: an input iterator. -`u16bit_iterator`: an output iterator. -`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. < br /> `end`: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert. -`result`: an output iterator to the place in the UTF-16 string where to append the result of conversion. -Return value: An iterator pointing to the place after the appended UTF-16 string. +`s`: an UTF-8 encoded string to convert. +Return value: A UTF-16 encoded string. Example of use: ```cpp -char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; -vector <unsigned short> utf16result; -utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); -assert (utf16result.size() == 4); +std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +std::u8string_view utf8stringview {utf8_with_surrogates}; +std::u16string utf16result = utf8to16(utf8stringview); +assert (utf16result.length() == 4); assert (utf16result[2] == 0xd834); assert (utf16result[3] == 0xdd1e); ``` -In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 seqence, a `utf8::not_enough_room` exception is thrown. +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. +<!-- TOC --><a name="utf8utf32to8"></a> #### utf8::utf32to8 +<!-- TOC --><a name="octet_iterator-utf32to8-u32bit_iterator-start-u32bit_iterator-end-octet_iterator-result"></a> +##### octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + +Available in version 1.0 and later. 
+ +Converts a UTF-32 encoded string to UTF-8. + +```cpp +template <typename octet_iterator, typename u32bit_iterator> +octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result); +``` + +`octet_iterator`: an output iterator. +`u32bit_iterator`: an input iterator. +`start`: an iterator pointing to the beginning of the UTF-32 encoded string to convert. +`end`: an iterator pointing to pass-the-end of the UTF-32 encoded string to convert. +`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-8 string. + +Example of use: + +```cpp +int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; +vector<unsigned char> utf8result; +utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + + +<!-- TOC --><a name="stdstring-utf32to8const-stdu32string-s"></a> +##### std::string utf32to8(const std::u32string& s) Available in version 3.0 and later. Requires a C++ 11 compliant compiler. @@ -550,7 +926,83 @@ assert (utf8result.size() == 9); In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. -#### utf8::utf32to8 +<!-- TOC --><a name="stdu8string-utf32to8const-stdu32string-s"></a> +##### std::u8string utf32to8(const std::u32string& s) + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +std::u8string utf32to8(const std::u32string& s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. + +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +u8string utf8result = utf32to8(utf32string); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. 
+ + +<!-- TOC --><a name="stdu8string-utf32to8const-stdu32string_view-s"></a> +##### std::u8string utf32to8(const std::u32string_view& s) + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +std::u8string utf32to8(const std::u32string_view& s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. + +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +u32string_view utf32stringview(utf32string); +u8string utf8result = utf32to8(utf32stringview); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + + +<!-- TOC --><a name="stdstring-utf32to8const-stdu32string-s-1"></a> +##### std::string utf32to8(const std::u32string& s) + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +std::string utf32to8(const std::u32string& s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. + +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +string utf8result = utf32to8(utf32string); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + +<!-- TOC --><a name="stdstring-utf32to8stdu32string_view-s"></a> +##### std::string utf32to8(std::u32string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -575,36 +1027,93 @@ assert (utf8result.size() == 9); In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. 
-#### utf8::utf32to8 +<!-- TOC --><a name="utf8utf8to32"></a> +#### utf8::utf8to32 +<!-- TOC --><a name="u32bit_iterator-utf8to32-octet_iterator-start-octet_iterator-end-u32bit_iterator-result"></a> +##### u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) Available in version 1.0 and later. -Converts a UTF-32 encoded string to UTF-8. +Converts a UTF-8 encoded string to UTF-32. ```cpp template <typename octet_iterator, typename u32bit_iterator> -octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result); +u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result); ``` -`octet_iterator`: an output iterator. -`u32bit_iterator`: an input iterator. -`start`: an iterator pointing to the beginning of the UTF-32 encoded string to convert. -`end`: an iterator pointing to pass-the-end of the UTF-32 encoded string to convert. -`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. -Return value: An iterator pointing to the place after the appended UTF-8 string. +`octet_iterator`: an input iterator. +`u32bit_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. +`end`: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert. +`result`: an output iterator to the place in the UTF-32 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-32 string. 
Example of use: ```cpp -int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; -vector<unsigned char> utf8result; -utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); -assert (utf8result.size() == 9); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +vector<int> utf32result; +utf8to32(twochars, twochars + 5, back_inserter(utf32result)); +assert (utf32result.size() == 2); ``` -In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 sequence, a `utf8::not_enough_room` exception is thrown. -#### utf8::utf8to32 + + +<!-- TOC --><a name="stdu32string-utf8to32const-stdu8string-s"></a> +##### std::u32string utf8to32(const std::u8string& s) + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-8 encoded string to UTF-32. + +```cpp +std::u32string utf8to32(const std::u8string& s); +``` + +`s`: a UTF-8 encoded string. +Return value: a UTF-32 encoded string. + +Example of use: + +```cpp +const std::u8string twochars = u8"\xe6\x97\xa5\xd1\x88"; +u32string utf32result = utf8to32(twochars); +assert (utf32result.size() == 2); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + + +<!-- TOC --><a name="stdu32string-utf8to32const-stdu8string_view-s"></a> +##### std::u32string utf8to32(const std::u8string_view& s) + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-8 encoded string to UTF-32. + +```cpp +std::u32string utf8to32(const std::u8string_view& s); +``` + +`s`: a UTF-8 encoded string. +Return value: a UTF-32 encoded string. 
+ +Example of use: + +```cpp +const u8string twochars = u8"\xe6\x97\xa5\xd1\x88"; +const u8string_view stringview{twochars}; +u32string utf32result = utf8to32(stringview); +assert (utf32result.size() == 2); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + + +<!-- TOC --><a name="stdu32string-utf8to32const-stdstring-s"></a> +##### std::u32string utf8to32(const std::string& s) Available in version 3.0 and later. Requires a C++ 11 compliant compiler. @@ -625,9 +1134,10 @@ u32string utf32result = utf8to32(twochars); assert (utf32result.size() == 2); ``` -In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. -#### utf8::utf8to32 +<!-- TOC --><a name="stdu32string-utf8to32stdstring_view-s"></a> +##### std::u32string utf8to32(std::string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -648,41 +1158,66 @@ u32string utf32result = utf8to32(twochars); assert (utf32result.size() == 2); ``` -In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. - +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. -#### utf8::utf8to32 +<!-- TOC --><a name="utf8find_invalid"></a> +#### utf8::find_invalid +<!-- TOC --><a name="octet_iterator-find_invalidoctet_iterator-start-octet_iterator-end"></a> +##### octet_iterator find_invalid(octet_iterator start, octet_iterator end) Available in version 1.0 and later. -Converts a UTF-8 encoded string to UTF-32. +Detects an invalid sequence within a UTF-8 string. ```cpp -template <typename octet_iterator, typename u32bit_iterator> -u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result); +template <typename octet_iterator> +octet_iterator find_invalid(octet_iterator start, octet_iterator end); ``` `octet_iterator`: an input iterator. -`u32bit_iterator`: an output iterator. 
-`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. -`end`: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert. -`result`: an output iterator to the place in the UTF-32 string where to append the result of conversion. -Return value: An iterator pointing to the place after the appended UTF-32 string. +`start`: an iterator pointing to the beginning of the UTF-8 string to test for validity. +`end`: an iterator pointing to pass-the-end of the UTF-8 string to test for validity. +Return value: an iterator pointing to the first invalid octet in the UTF-8 string. In case none were found, equals `end`. Example of use: ```cpp -char* twochars = "\xe6\x97\xa5\xd1\x88"; -vector<int> utf32result; -utf8to32(twochars, twochars + 5, back_inserter(utf32result)); -assert (utf32result.size() == 2); +char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +char* invalid = find_invalid(utf_invalid, utf_invalid + 6); +assert (invalid == utf_invalid + 5); ``` -In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 seqence, a `utf8::not_enough_room` exception is thrown. +This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it if before doing any of the _unchecked_ operations on it. -#### utf8::find_invalid -Available in version 3.0 and later. Requires a C++ 11 compliant compiler. +<!-- TOC --><a name="const-char-find_invalidconst-char-str"></a> +##### const char* find_invalid(const char* str) + +Available in version 4.0 and later. + +Detects an invalid sequence within a C-style UTF-8 string. + +```cpp +const char* find_invalid(const char* str); +``` + +`str`: a UTF-8 encoded string. +Return value: a pointer to the first invalid octet in the UTF-8 string. In case none were found, points to the trailing zero byte. 
+ +Example of use: + +```cpp +const char* utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; +const char* invalid = find_invalid(utf_invalid); +assert ((invalid - utf_invalid) == 5); +``` + +This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it if before doing any of the _unchecked_ operations on it. + +<!-- TOC --><a name="stdsize_t-find_invalidconst-stdstring-s"></a> +##### std::size_t find_invalid(const std::string& s) + +Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0 Detects an invalid sequence within a UTF-8 string. @@ -703,7 +1238,8 @@ assert (invalid == 5); This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it if before doing any of the _unchecked_ operations on it. -#### utf8::find_invalid +<!-- TOC --><a name="stdsize_t-find_invalidstdstring_view-s"></a> +##### std::size_t find_invalid(std::string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -726,36 +1262,65 @@ assert (invalid == 5); This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it if before doing any of the _unchecked_ operations on it. - -#### utf8::find_invalid +<!-- TOC --><a name="utf8is_valid"></a> +#### utf8::is_valid +<!-- TOC --><a name="bool-is_validoctet_iterator-start-octet_iterator-end"></a> +##### bool is_valid(octet_iterator start, octet_iterator end) Available in version 1.0 and later. -Detects an invalid sequence within a UTF-8 string. +Checks whether a sequence of octets is a valid UTF-8 string. ```cpp template <typename octet_iterator> -octet_iterator find_invalid(octet_iterator start, octet_iterator end); +bool is_valid(octet_iterator start, octet_iterator end); ``` `octet_iterator`: an input iterator. 
`start`: an iterator pointing to the beginning of the UTF-8 string to test for validity. `end`: an iterator pointing to pass-the-end of the UTF-8 string to test for validity. -Return value: an iterator pointing to the first invalid octet in the UTF-8 string. In case none were found, equals `end`. +Return value: `true` if the sequence is a valid UTF-8 string; `false` if not. Example of use: ```cpp char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; -char* invalid = find_invalid(utf_invalid, utf_invalid + 6); -assert (invalid == utf_invalid + 5); +bool bvalid = is_valid(utf_invalid, utf_invalid + 6); +assert (bvalid == false); ``` -This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it if before doing any of the _unchecked_ operations on it. +`is_valid` is a shorthand for `find_invalid(start, end) == end;`. You may want to use it to make sure that a byte sequence is a valid UTF-8 string without the need to know where it fails if it is not valid. -#### utf8::is_valid -Available in version 3.0 and later. Requires a C++ 11 compliant compiler. +<!-- TOC --><a name="bool-is_validconst-char-str"></a> +##### bool is_valid(const char* str) + +Available in version 4.0 and later. + +Checks whether a C-style string contains valid UTF-8 encoded text. + +```cpp +bool is_valid(const char* str); +``` + +`str`: a UTF-8 encoded string. +Return value: `true` if the string contains valid UTF-8 encoded text; `false` if not. + +Example of use: + +```cpp +char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +bool bvalid = is_valid(utf_invalid); +assert (bvalid == false); +``` + +You may want to use `is_valid` to make sure that a string contains valid UTF-8 text without the need to know where it fails if it is not valid. + + +<!-- TOC --><a name="bool-is_validconst-stdstring-s"></a> +##### bool is_valid(const std::string& s) + +Available in version 3.0 and later. 
Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0 Checks whether a string object contains valid UTF-8 encoded text. @@ -776,7 +1341,8 @@ assert (bvalid == false); You may want to use `is_valid` to make sure that a string contains valid UTF-8 text without the need to know where it fails if it is not valid. -#### utf8::is_valid +<!-- TOC --><a name="bool-is_validstdstring_view-s"></a> +##### bool is_valid(std::string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -799,41 +1365,54 @@ assert (bvalid == false); You may want to use `is_valid` to make sure that a string contains valid UTF-8 text without the need to know where it fails if it is not valid. +<!-- TOC --><a name="utf8replace_invalid"></a> +#### utf8::replace_invalid +<!-- TOC --><a name="output_iterator-replace_invalidoctet_iterator-start-octet_iterator-end-output_iterator-out-utfchar32_t-replacement"></a> +##### output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) -#### utf8::is_valid - -Available in version 1.0 and later. +Available in version 2.0 and later. -Checks whether a sequence of octets is a valid UTF-8 string. +Replaces all invalid UTF-8 sequences within a string with a replacement marker. ```cpp -template <typename octet_iterator> -bool is_valid(octet_iterator start, octet_iterator end); +template <typename octet_iterator, typename output_iterator> +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement); +template <typename octet_iterator, typename output_iterator> +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); ``` `octet_iterator`: an input iterator. -`start`: an iterator pointing to the beginning of the UTF-8 string to test for validity. -`end`: an iterator pointing to pass-the-end of the UTF-8 string to test for validity. 
-Return value: `true` if the sequence is a valid UTF-8 string; `false` if not. +`output_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences. +`end`: an iterator pointing to pass-the-end of the UTF-8 string to look for invalid UTF-8 sequences. +`out`: An output iterator to the range where the result of replacement is stored. +`replacement`: A Unicode code point for the replacement marker. The version without this parameter assumes the value `0xfffd` +Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences. Example of use: ```cpp -char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; -bool bvalid = is_valid(utf_invalid, utf_invalid + 6); -assert (bvalid == false); +char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; +vector<char> replace_invalid_result; +replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?'); +bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); +assert (bvalid); +char* fixed_invalid_sequence = "a????z"; +assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence)); ``` -`is_valid` is a shorthand for `find_invalid(start, end) == end;`. You may want to use it to make sure that a byte seqence is a valid UTF-8 string without the need to know where it fails if it is not valid. +`replace_invalid` does not perform in-place replacement of invalid sequences. Rather, it produces a copy of the original string with the invalid sequences replaced with a replacement marker. Therefore, `out` must not be in the `[start, end]` range. -#### utf8::replace_invalid -Available in version 3.0 and later. Requires a C++ 11 compliant compiler. 
+<!-- TOC --><a name="stdstring-replace_invalidconst-stdstring-s-utfchar32_t-replacement"></a> +##### std::string replace_invalid(const std::string& s, utfchar32_t replacement) + +Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0 Replaces all invalid UTF-8 sequences within a string with a replacement marker. ```cpp -std::string replace_invalid(const std::string& s, char32_t replacement); +std::string replace_invalid(const std::string& s, utfchar32_t replacement); std::string replace_invalid(const std::string& s); ``` @@ -852,7 +1431,8 @@ const string fixed_invalid_sequence = "a????z"; assert (fixed_invalid_sequence == replace_invalid_result); ``` -#### utf8::replace_invalid +<!-- TOC --><a name="stdstring-replace_invalidstdstring_view-s-char32_t-replacement"></a> +##### std::string replace_invalid(std::string_view s, char32_t replacement) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -878,45 +1458,40 @@ const string fixed_invalid_sequence = "a????z"; assert(fixed_invalid_sequence, replace_invalid_result); ``` +<!-- TOC --><a name="utf8starts_with_bom"></a> +#### utf8::starts_with_bom +<!-- TOC --><a name="bool-starts_with_bom-octet_iterator-it-octet_iterator-end"></a> +##### bool starts_with_bom (octet_iterator it, octet_iterator end) -#### utf8::replace_invalid - -Available in version 2.0 and later. +Available in version 2.3 and later. -Replaces all invalid UTF-8 sequences within a string with a replacement marker. 
+Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM) ```cpp -template <typename octet_iterator, typename output_iterator> -output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement); -template <typename octet_iterator, typename output_iterator> -output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); +template <typename octet_iterator> +bool starts_with_bom (octet_iterator it, octet_iterator end); ``` `octet_iterator`: an input iterator. -`output_iterator`: an output iterator. -`start`: an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences. -`end`: an iterator pointing to pass-the-end of the UTF-8 string to look for invalid UTF-8 sequences. -`out`: An output iterator to the range where the result of replacement is stored. -`replacement`: A Unicode code point for the replacement marker. The version without this parameter assumes the value `0xfffd` -Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences. +`it`: beginning of the octet sequence to check +`end`: pass-end of the sequence to check +Return value: `true` if the sequence starts with a UTF-8 byte order mark; `false` if not. 
Example of use: ```cpp -char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; -vector<char> replace_invalid_result; -replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?'); -bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); -assert (bvalid); -char* fixed_invalid_sequence = "a????z"; -assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence)); +unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; +bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); +assert (bbom == true); ``` -`replace_invalid` does not perform in-place replacement of invalid sequences. Rather, it produces a copy of the original string with the invalid sequences replaced with a replacement marker. Therefore, `out` must not be in the `[start, end]` range. +The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. -#### utf8::starts_with_bom -Available in version 3.0 and later. Requires a C++ 11 compliant compiler. +<!-- TOC --><a name="bool-starts_with_bomconst-stdstring-s"></a> +##### bool starts_with_bom(const std::string& s) + +Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0 Checks whether a string starts with a UTF-8 byte order mark (BOM) @@ -941,7 +1516,8 @@ assert (no_bbom == false); The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. -#### utf8::starts_with_bom +<!-- TOC --><a name="bool-starts_with_bomstdstring_view-s"></a> +##### bool starts_with_bom(std::string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. 
@@ -969,34 +1545,10 @@ assert (!no_bbom); The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. -#### utf8::starts_with_bom - -Available in version 2.3 and later. - -Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM) - -```cpp -template <typename octet_iterator> -bool starts_with_bom (octet_iterator it, octet_iterator end); -``` - -`octet_iterator`: an input iterator. -`it`: beginning of the octet sequence to check -`end`: pass-end of the sequence to check -Return value: `true` if the sequence starts with a UTF-8 byte order mark; `false` if not. - -Example of use: - -```cpp -unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; -bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); -assert (bbom == true); -``` - -The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. - +<!-- TOC --><a name="types-from-utf8-namespace"></a> ### Types From utf8 Namespace +<!-- TOC --><a name="utf8exception"></a> #### utf8::exception Available in version 2.3 and later. @@ -1018,6 +1570,7 @@ catch(const utf8::exception& utfcpp_ex) { } ``` +<!-- TOC --><a name="utf8invalid_code_point"></a> #### utf8::invalid_code_point Available in version 1.0 and later. @@ -1027,12 +1580,13 @@ Thrown by UTF8 CPP functions such as `advance` and `next` if an UTF-8 sequence r ```cpp class invalid_code_point : public exception { public: - uint32_t code_point() const; + utfchar32_t code_point() const; }; ``` Member function `code_point()` can be used to determine the invalid code point that caused the exception to be thrown. +<!-- TOC --><a name="utf8invalid_utf8"></a> #### utf8::invalid_utf8 Available in version 1.0 and later. 
@@ -1042,12 +1596,13 @@ Thrown by UTF8 CPP functions such as `next` and `prior` if an invalid UTF-8 sequ ```cpp class invalid_utf8 : public exception { public: - uint8_t utf8_octet() const; + utfchar8_t utf8_octet() const; }; ``` Member function `utf8_octet()` can be used to determine the beginning of the byte sequence that caused the exception to be thrown. +<!-- TOC --><a name="utf8invalid_utf16"></a> #### utf8::invalid_utf16 Available in version 1.0 and later. @@ -1057,12 +1612,13 @@ Thrown by UTF8 CPP function `utf16to8` if an invalid UTF-16 sequence is detected ```cpp class invalid_utf16 : public exception { public: - uint16_t utf16_word() const; + utfchar16_t utf16_word() const; }; ``` Member function `utf16_word()` can be used to determine the UTF-16 code unit that caused the exception to be thrown. +<!-- TOC --><a name="utf8not_enough_room"></a> #### utf8::not_enough_room Available in version 1.0 and later. @@ -1073,6 +1629,7 @@ Thrown by UTF8 CPP functions such as `next` if the end of the decoded UTF-8 sequ class not_enough_room : public exception {}; ``` +<!-- TOC --><a name="utf8iterator"></a> #### utf8::iterator Available in version 2.0 and later. @@ -1084,6 +1641,7 @@ template <typename octet_iterator> class iterator; ``` +<!-- TOC --><a name="member-functions"></a> ##### Member functions `iterator();` the default constructor; the underlying octet_iterator is constructed with its default constructor. @@ -1092,11 +1650,11 @@ class iterator; `octet_iterator base () const;` returns the underlying octet_iterator. -`uint32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point. +`utfchar32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point. -`bool operator == (const iterator& rhs) const;` returns `true` if the two underlaying iterators are equal.
+`bool operator == (const iterator& rhs) const;` returns `true` if the two underlying iterators are equal. -`bool operator != (const iterator& rhs) const;` returns `true` if the two underlaying iterators are not equal. +`bool operator != (const iterator& rhs) const;` returns `true` if the two underlying iterators are not equal. `iterator& operator ++ ();` the prefix increment - moves the iterator to the next UTF-8 encoded code point. @@ -1136,8 +1694,10 @@ std::string s = "example"; utf8::iterator i (s.begin(), s.begin(), s.end()); ``` +<!-- TOC --><a name="functions-from-utf8unchecked-namespace"></a> ### Functions From utf8::unchecked Namespace +<!-- TOC --><a name="utf8uncheckedappend"></a> #### utf8::unchecked::append Available in version 1.0 and later. @@ -1146,7 +1706,7 @@ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequen ```cpp template <typename octet_iterator> -octet_iterator append(uint32_t cp, octet_iterator result); +octet_iterator append(utfchar32_t cp, octet_iterator result); ``` `cp`: A 32 bit integer representing a code point to append to the sequence. @@ -1163,6 +1723,35 @@ assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); This is a faster but less safe version of `utf8::append`. It does not check for validity of the supplied code point, and may produce an invalid UTF-8 sequence. +<!-- TOC --><a name="utf8uncheckedappend16"></a> +#### utf8::unchecked::append16 + +Available in version 4.0 and later. + +Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string. + +```cpp +template <typename word_iterator> +word_iterator append16(utfchar32_t cp, word_iterator result) +``` + +`cp`: A 32 bit integer representing a code point to append to the sequence. +`result`: An output iterator to the place in the sequence where to append the code point. +Return value: An iterator pointing to the place after the newly appended sequence. 
+ +Example of use: + +```cpp +unsigned short u[5] = {0,0}; +utf8::unchecked::append16(0x0448, u); +assert(u[0] == 0x0448); +assert(u[1] == 0x0000); +``` + +This is a faster but less safe version of `utf8::append16`. It does not check for validity of the supplied code point, and may produce an invalid UTF-16 sequence. + + +<!-- TOC --><a name="utf8uncheckednext"></a> #### utf8::unchecked::next Available in version 1.0 and later. @@ -1171,7 +1760,7 @@ Given the iterator to the beginning of a UTF-8 sequence, it returns the code poi ```cpp template <typename octet_iterator> -uint32_t next(octet_iterator& it); +utfchar32_t next(octet_iterator& it); ``` `it`: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point. @@ -1189,6 +1778,39 @@ assert (w == twochars + 3); This is a faster but less safe version of `utf8::next`. It does not check for validity of the supplied UTF-8 sequence. +<!-- TOC --><a name="utf8next16-1"></a> +#### utf8::unchecked::next16 + +Available in version 4.0 and later. + +Given the iterator to the beginning of the UTF-16 sequence, it returns the code point and moves the iterator to the next position. + +```cpp +template <typename word_iterator> +utfchar32_t next16(word_iterator& it); +``` + +`word_iterator`: an input iterator. +`it`: a reference to an iterator pointing to the beginning of an UTF-16 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point. + +Return value: the 32 bit representation of the processed UTF-16 code point. + +Example of use: + +```cpp +const unsigned short u[3] = {0x65e5, 0xd800, 0xdf46}; +const unsigned short* w = u; +int cp = unchecked::next16(w); +assert (cp == 0x65e5); +assert (w == u + 1); +``` + +This function is typically used to iterate through a UTF-16 encoded string. + +This is a faster but less safe version of `utf8::next16`.
It does not check for validity of the supplied UTF-16 sequence. + + +<!-- TOC --><a name="utf8uncheckedpeek_next"></a> #### utf8::unchecked::peek_next Available in version 2.1 and later. @@ -1197,7 +1819,7 @@ Given the iterator to the beginning of a UTF-8 sequence, it returns the code poi ```cpp template <typename octet_iterator> -uint32_t peek_next(octet_iterator it); +utfchar32_t peek_next(octet_iterator it); ``` `it`: an iterator pointing to the beginning of an UTF-8 encoded code point. @@ -1215,15 +1837,16 @@ assert (w == twochars); This is a faster but less safe version of `utf8::peek_next`. It does not check for validity of the supplied UTF-8 sequence. +<!-- TOC --><a name="utf8uncheckedprior"></a> #### utf8::unchecked::prior Available in version 1.02 and later. -Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point. +Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point. ```cpp template <typename octet_iterator> -uint32_t prior(octet_iterator& it); +utfchar32_t prior(octet_iterator& it); ``` `it`: a reference pointing to an octet within a UTF-8 encoded string. After the function returns, it is decremented to point to the beginning of the previous code point. @@ -1241,6 +1864,7 @@ assert (w == twochars); This is a faster but less safe version of `utf8::prior`. It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking. +<!-- TOC --><a name="utf8uncheckedadvance"></a> #### utf8::unchecked::advance Available in version 1.0 and later. @@ -1266,11 +1890,12 @@ assert (w == twochars + 5); This is a faster but less safe version of `utf8::advance`.
It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking. +<!-- TOC --><a name="utf8uncheckeddistance"></a> #### utf8::unchecked::distance Available in version 1.0 and later. -Given the iterators to two UTF-8 encoded code points in a seqence, returns the number of code points between them. +Given the iterators to two UTF-8 encoded code points in a sequence, returns the number of code points between them. ```cpp template <typename octet_iterator> @@ -1291,6 +1916,7 @@ assert (dist == 2); This is a faster but less safe version of `utf8::distance`. It does not check for validity of the supplied UTF-8 sequence. +<!-- TOC --><a name="utf8uncheckedutf16to8"></a> #### utf8::unchecked::utf16to8 Available in version 1.0 and later. @@ -1318,6 +1944,7 @@ assert (utf8result.size() == 10); This is a faster but less safe version of `utf8::utf16to8`. It does not check for validity of the supplied UTF-16 sequence. +<!-- TOC --><a name="utf8uncheckedutf8to16"></a> #### utf8::unchecked::utf8to16 Available in version 1.0 and later. @@ -1329,7 +1956,8 @@ template <typename u16bit_iterator, typename octet_iterator> u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result); ``` -`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. < br /> `end`: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert. +`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. +`end`: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert. `result`: an output iterator to the place in the UTF-16 string where to append the result of conversion. Return value: An iterator pointing to the place after the appended UTF-16 string. @@ -1346,6 +1974,7 @@ assert (utf16result[3] == 0xdd1e); This is a faster but less safe version of `utf8::utf8to16`. It does not check for validity of the supplied UTF-8 sequence. 
+<!-- TOC --><a name="utf8uncheckedutf32to8"></a> #### utf8::unchecked::utf32to8 Available in version 1.0 and later. @@ -1373,6 +2002,7 @@ assert (utf8result.size() == 9); This is a faster but less safe version of `utf8::utf32to8`. It does not check for validity of the supplied UTF-32 sequence. +<!-- TOC --><a name="utf8uncheckedutf8to32"></a> #### utf8::unchecked::utf8to32 Available in version 1.0 and later. @@ -1400,6 +2030,7 @@ assert (utf32result.size() == 2); This is a faster but less safe version of `utf8::utf8to32`. It does not check for validity of the supplied UTF-8 sequence. +<!-- TOC --><a name="utf8uncheckedreplace_invalid"></a> #### utf8::unchecked::replace_invalid Available in version 3.1 and later. @@ -1408,7 +2039,7 @@ Replaces all invalid UTF-8 sequences within a string with a replacement marker. ```cpp template <typename octet_iterator, typename output_iterator> -output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement); +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement); template <typename octet_iterator, typename output_iterator> output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); ``` @@ -1437,8 +2068,10 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), Unlike `utf8::replace_invalid`, this function does not verify validity of the replacement marker. +<!-- TOC --><a name="types-from-utf8unchecked-namespace"></a> ### Types From utf8::unchecked Namespace +<!-- TOC --><a name="utf8iterator-1"></a> #### utf8::unchecked::iterator Available in version 2.0 and later. @@ -1450,6 +2083,7 @@ template <typename octet_iterator> class iterator; ``` +<!-- TOC --><a name="member-functions-1"></a> ##### Member functions `iterator();` the default constructor; the underlying octet_iterator is constructed with its default constructor.
@@ -1458,11 +2092,11 @@ class iterator; `octet_iterator base () const;` returns the underlying octet_iterator. -`uint32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point. +`utfchar32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point. -`bool operator == (const iterator& rhs) const;` returns `true` if the two underlaying iterators are equal. +`bool operator == (const iterator& rhs) const;` returns `true` if the two underlying iterators are equal. -`bool operator != (const iterator& rhs) const;` returns `true` if the two underlaying iterators are not equal. +`bool operator != (const iterator& rhs) const;` returns `true` if the two underlying iterators are not equal. `iterator& operator ++ ();` the prefix increment - moves the iterator to the next UTF-8 encoded code point. @@ -1495,9 +2129,3 @@ assert (*un_it == 0x10346); This is an unchecked version of `utf8::iterator`. It is faster in many cases, but offers no validity or range checks. -## Links - -1. [The Unicode Consortium](http://www.unicode.org/). -2. [ICU Library](http://icu.sourceforge.net/). -3. [UTF-8 at Wikipedia](http://en.wikipedia.org/wiki/UTF-8) -4. [UTF-8 and Unicode FAQ for Unix/Linux](http://www.cl.cam.ac.uk/~mgk25/unicode.html) diff --git a/lib/utfcpp/v3/source/utf8.h b/lib/utfcpp/v4/source/utf8.h index 82b13f59..b5135309 100644 --- a/lib/utfcpp/v3/source/utf8.h +++ b/lib/utfcpp/v4/source/utf8.h @@ -28,6 +28,18 @@ DEALINGS IN THE SOFTWARE. #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +/* +To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro +and set it to one of the values used by the __cplusplus predefined macro. 
+ +For instance, + #define UTF_CPP_CPLUSPLUS 199711L +will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard. +Some library features will be disabled. + +If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus. +*/ + #include "utf8/checked.h" #include "utf8/unchecked.h" diff --git a/lib/utfcpp/v3/source/utf8/checked.h b/lib/utfcpp/v4/source/utf8/checked.h index 512dcc2f..98949f8b 100644 --- a/lib/utfcpp/v3/source/utf8/checked.h +++ b/lib/utfcpp/v4/source/utf8/checked.h @@ -39,28 +39,28 @@ namespace utf8 // Exceptions that may be thrown from the library functions. class invalid_code_point : public exception { - uint32_t cp; + utfchar32_t cp; public: - invalid_code_point(uint32_t codepoint) : cp(codepoint) {} + invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {} virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } - uint32_t code_point() const {return cp;} + utfchar32_t code_point() const {return cp;} }; class invalid_utf8 : public exception { - uint8_t u8; + utfchar8_t u8; public: - invalid_utf8 (uint8_t u) : u8(u) {} - invalid_utf8 (char c) : u8(static_cast<uint8_t>(c)) {} + invalid_utf8 (utfchar8_t u) : u8(u) {} + invalid_utf8 (char c) : u8(static_cast<utfchar8_t>(c)) {} virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } - uint8_t utf8_octet() const {return u8;} + utfchar8_t utf8_octet() const {return u8;} }; class invalid_utf16 : public exception { - uint16_t u16; + utfchar16_t u16; public: - invalid_utf16 (uint16_t u) : u16(u) {} + invalid_utf16 (utfchar16_t u) : u16(u) {} virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } - uint16_t utf16_word() const {return u16;} + utfchar16_t utf16_word() const {return u16;} }; class not_enough_room : public exception { @@ -71,7 +71,7 @@ namespace utf8 /// The library API - functions intended to be 
called by the users template <typename octet_iterator> - octet_iterator append(uint32_t cp, octet_iterator result) + octet_iterator append(utfchar32_t cp, octet_iterator result) { if (!utf8::internal::is_code_point_valid(cp)) throw invalid_code_point(cp); @@ -79,8 +79,22 @@ namespace utf8 return internal::append(cp, result); } + inline void append(utfchar32_t cp, std::string& s) + { + append(cp, std::back_inserter(s)); + } + + template <typename word_iterator> + word_iterator append16(utfchar32_t cp, word_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append16(cp, result); + } + template <typename octet_iterator, typename output_iterator> - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) { while (start != end) { octet_iterator sequence_start = start; @@ -115,14 +129,28 @@ namespace utf8 template <typename octet_iterator, typename output_iterator> inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + static const utfchar32_t replacement_marker = utf8::internal::mask16(0xfffd); return utf8::replace_invalid(start, end, out, replacement_marker); } + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + template <typename octet_iterator> - uint32_t next(octet_iterator& it, octet_iterator end) + utfchar32_t next(octet_iterator& it, octet_iterator end) { - 
uint32_t cp = 0; + utfchar32_t cp = 0; internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); switch (err_code) { case internal::UTF8_OK : @@ -132,21 +160,31 @@ namespace utf8 case internal::INVALID_LEAD : case internal::INCOMPLETE_SEQUENCE : case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(static_cast<uint8_t>(*it)); + throw invalid_utf8(static_cast<utfchar8_t>(*it)); case internal::INVALID_CODE_POINT : throw invalid_code_point(cp); } return cp; } + template <typename word_iterator> + utfchar32_t next16(word_iterator& it, word_iterator end) + { + utfchar32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp); + if (err_code == internal::NOT_ENOUGH_ROOM) + throw not_enough_room(); + return cp; + } + template <typename octet_iterator> - uint32_t peek_next(octet_iterator it, octet_iterator end) + utfchar32_t peek_next(octet_iterator it, octet_iterator end) { return utf8::next(it, end); } template <typename octet_iterator> - uint32_t prior(octet_iterator& it, octet_iterator start) + utfchar32_t prior(octet_iterator& it, octet_iterator start) { // can't do much if it == start if (it == start) @@ -189,23 +227,23 @@ namespace utf8 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) { while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); + utfchar32_t cp = utf8::internal::mask16(*start++); // Take care of surrogate pairs first if (utf8::internal::is_lead_surrogate(cp)) { if (start != end) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); + const utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); if (utf8::internal::is_trail_surrogate(trail_surrogate)) cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; else - throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); + throw invalid_utf16(static_cast<utfchar16_t>(trail_surrogate)); } else - throw invalid_utf16(static_cast<uint16_t>(cp)); + throw 
invalid_utf16(static_cast<utfchar16_t>(cp)); } // Lone trail surrogate else if (utf8::internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast<uint16_t>(cp)); + throw invalid_utf16(static_cast<utfchar16_t>(cp)); result = utf8::append(cp, result); } @@ -216,13 +254,13 @@ namespace utf8 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) { while (start < end) { - uint32_t cp = utf8::next(start, end); + const utfchar32_t cp = utf8::next(start, end); if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + *result++ = static_cast<utfchar16_t>((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); } else - *result++ = static_cast<uint16_t>(cp); + *result++ = static_cast<utfchar16_t>(cp); } return result; } @@ -252,9 +290,9 @@ namespace utf8 octet_iterator range_start; octet_iterator range_end; public: - typedef uint32_t value_type; - typedef uint32_t* pointer; - typedef uint32_t& reference; + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; typedef std::ptrdiff_t difference_type; typedef std::bidirectional_iterator_tag iterator_category; iterator () {} @@ -268,7 +306,7 @@ namespace utf8 } // the default "big three" are OK octet_iterator base () const { return it; } - uint32_t operator * () const + utfchar32_t operator * () const { octet_iterator temp = it; return utf8::next(temp, range_end); @@ -309,7 +347,9 @@ namespace utf8 } // namespace utf8 -#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later +#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later +#include "cpp20.h" +#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later #include "cpp17.h" #elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later #include "cpp11.h" diff --git a/lib/utfcpp/v3/source/utf8/core.h 
b/lib/utfcpp/v4/source/utf8/core.h index 34371ee3..4494c538 100644 --- a/lib/utfcpp/v3/source/utf8/core.h +++ b/lib/utfcpp/v4/source/utf8/core.h @@ -29,6 +29,8 @@ DEALINGS IN THE SOFTWARE. #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 #include <iterator> +#include <cstring> +#include <string> // Determine the C++ standard version. // If the user defines UTF_CPP_CPLUSPLUS, use that. @@ -49,12 +51,20 @@ DEALINGS IN THE SOFTWARE. namespace utf8 { - // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers - // You may need to change them to match your system. - // These typedefs have the same names as ones from cstdint, or boost/cstdint - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; +// The typedefs for 8-bit, 16-bit and 32-bit code units +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later + typedef char8_t utfchar8_t; + #else // C++ 11/14/17 + typedef unsigned char utfchar8_t; + #endif + typedef char16_t utfchar16_t; + typedef char32_t utfchar32_t; +#else // C++ 98/03 + typedef unsigned char utfchar8_t; + typedef unsigned short utfchar16_t; + typedef unsigned int utfchar32_t; +#endif // C++ 11 or later // Helper code - not intended to be directly called by the library users. 
May be changed at any time namespace internal @@ -62,61 +72,62 @@ namespace internal // Unicode constants // Leading (high) surrogates: 0xd800 - 0xdbff // Trailing (low) surrogates: 0xdc00 - 0xdfff - const uint16_t LEAD_SURROGATE_MIN = 0xd800u; - const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; - const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; - const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const uint16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) - const uint32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN + const utfchar16_t LEAD_SURROGATE_MIN = 0xd800u; + const utfchar16_t LEAD_SURROGATE_MAX = 0xdbffu; + const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) + const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN // Maximum valid value for a Unicode code point - const uint32_t CODE_POINT_MAX = 0x0010ffffu; + const utfchar32_t CODE_POINT_MAX = 0x0010ffffu; template<typename octet_type> - inline uint8_t mask8(octet_type oc) + inline utfchar8_t mask8(octet_type oc) { - return static_cast<uint8_t>(0xff & oc); + return static_cast<utfchar8_t>(0xff & oc); } template<typename u16_type> - inline uint16_t mask16(u16_type oc) + inline utfchar16_t mask16(u16_type oc) { - return static_cast<uint16_t>(0xffff & oc); + return static_cast<utfchar16_t>(0xffff & oc); } + template<typename octet_type> inline bool is_trail(octet_type oc) { return ((utf8::internal::mask8(oc) >> 6) == 0x2); } - template <typename u16> - inline bool is_lead_surrogate(u16 cp) + inline bool is_lead_surrogate(utfchar32_t cp) { return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); } - template <typename u16> - inline bool is_trail_surrogate(u16 cp) + inline bool is_trail_surrogate(utfchar32_t cp) { return (cp >= TRAIL_SURROGATE_MIN && cp <= 
TRAIL_SURROGATE_MAX); } - template <typename u16> - inline bool is_surrogate(u16 cp) + inline bool is_surrogate(utfchar32_t cp) { return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); } - template <typename u32> - inline bool is_code_point_valid(u32 cp) + inline bool is_code_point_valid(utfchar32_t cp) { return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); } + inline bool is_in_bmp(utfchar32_t cp) + { + return cp < utfchar32_t(0x10000); + } + template <typename octet_iterator> - inline typename std::iterator_traits<octet_iterator>::difference_type - sequence_length(octet_iterator lead_it) + int sequence_length(octet_iterator lead_it) { - uint8_t lead = utf8::internal::mask8(*lead_it); + const utfchar8_t lead = utf8::internal::mask8(*lead_it); if (lead < 0x80) return 1; else if ((lead >> 5) == 0x6) @@ -129,8 +140,7 @@ namespace internal return 0; } - template <typename octet_difference_type> - inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) + inline bool is_overlong_sequence(utfchar32_t cp, int length) { if (cp < 0x80) { if (length != 1) @@ -144,7 +154,6 @@ namespace internal if (length != 3) return true; } - return false; } @@ -152,7 +161,7 @@ namespace internal /// Helper for get_sequence_x template <typename octet_iterator> - utf_error increase_safely(octet_iterator& it, octet_iterator end) + utf_error increase_safely(octet_iterator& it, const octet_iterator end) { if (++it == end) return NOT_ENOUGH_ROOM; @@ -163,11 +172,11 @@ namespace internal return UTF8_OK; } - #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} /// get_sequence_x functions decode utf-8 sequences of the length x template <typename octet_iterator> - utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& 
code_point) + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) { if (it == end) return NOT_ENOUGH_ROOM; @@ -178,7 +187,7 @@ namespace internal } template <typename octet_iterator> - utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) { if (it == end) return NOT_ENOUGH_ROOM; @@ -193,7 +202,7 @@ namespace internal } template <typename octet_iterator> - utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) { if (it == end) return NOT_ENOUGH_ROOM; @@ -212,7 +221,7 @@ namespace internal } template <typename octet_iterator> - utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) { if (it == end) return NOT_ENOUGH_ROOM; @@ -237,7 +246,7 @@ namespace internal #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR template <typename octet_iterator> - utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) + utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) { if (it == end) return NOT_ENOUGH_ROOM; @@ -246,10 +255,9 @@ namespace internal // Of course, it does not make much sense with i.e. 
stream iterators octet_iterator original_it = it; - uint32_t cp = 0; + utfchar32_t cp = 0; // Determine the sequence length based on the lead octet - typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type; - const octet_difference_type length = utf8::internal::sequence_length(it); + const int length = utf8::internal::sequence_length(it); // Get trail octets and calculate the code point utf_error err = UTF8_OK; @@ -293,15 +301,51 @@ namespace internal template <typename octet_iterator> inline utf_error validate_next(octet_iterator& it, octet_iterator end) { - uint32_t ignored; + utfchar32_t ignored; return utf8::internal::validate_next(it, end, ignored); } + template <typename word_iterator> + utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + word_iterator original_it = it; + + utf_error err = UTF8_OK; + + const utfchar16_t first_word = *it++; + if (!is_surrogate(first_word)) { + code_point = first_word; + return UTF8_OK; + } + else { + if (it == end) + err = NOT_ENOUGH_ROOM; + else if (is_lead_surrogate(first_word)) { + const utfchar16_t second_word = *it++; + if (is_trail_surrogate(second_word)) { + code_point = (first_word << 10) + second_word + SURROGATE_OFFSET; + return UTF8_OK; + } else + err = INCOMPLETE_SEQUENCE; + + } else { + err = INVALID_LEAD; + } + } + // error branch + it = original_it; + return err; + } + // Internal implementation of both checked and unchecked append() function // This function will be invoked by the overloads below, as they will know // the octet_type. 
template <typename octet_iterator, typename octet_type> - octet_iterator append(uint32_t cp, octet_iterator result) { + octet_iterator append(utfchar32_t cp, octet_iterator result) { if (cp < 0x80) // one octet *(result++) = static_cast<octet_type>(cp); else if (cp < 0x800) { // two octets @@ -325,7 +369,7 @@ namespace internal // One of the following overloads will be invoked from the API calls // A simple (but dangerous) case: the caller appends byte(s) to a char array - inline char* append(uint32_t cp, char* result) { + inline char* append(utfchar32_t cp, char* result) { return append<char*, char>(cp, result); } @@ -333,17 +377,49 @@ namespace internal // i.e. append(cp, std::back_inserter(str)); template<typename container_type> std::back_insert_iterator<container_type> append - (uint32_t cp, std::back_insert_iterator<container_type> result) { + (utfchar32_t cp, std::back_insert_iterator<container_type> result) { return append<std::back_insert_iterator<container_type>, typename container_type::value_type>(cp, result); } // The caller uses some other kind of output operator - not covered above // Note that in this case we are not able to determine octet_type - // so we assume it's uint_8; that can cause a conversion warning if we are wrong. + // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong. template <typename octet_iterator> - octet_iterator append(uint32_t cp, octet_iterator result) { - return append<octet_iterator, uint8_t>(cp, result); + octet_iterator append(utfchar32_t cp, octet_iterator result) { + return append<octet_iterator, utfchar8_t>(cp, result); + } + + // Internal implementation of both checked and unchecked append16() function + // This function will be invoked by the overloads below, as they will know + // the word_type. 
+ template <typename word_iterator, typename word_type> + word_iterator append16(utfchar32_t cp, word_iterator result) { + if (is_in_bmp(cp)) + *(result++) = static_cast<word_type>(cp); + else { + // Code points from the supplementary planes are encoded via surrogate pairs + *(result++) = static_cast<word_type>(LEAD_OFFSET + (cp >> 10)); + *(result++) = static_cast<word_type>(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + } + return result; + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. append16(cp, std::back_inserter(str)); + template<typename container_type> + std::back_insert_iterator<container_type> append16 + (utfchar32_t cp, std::back_insert_iterator<container_type> result) { + return append16<std::back_insert_iterator<container_type>, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine word_type + // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong. + template <typename word_iterator> + word_iterator append16(utfchar32_t cp, word_iterator result) { + return append16<word_iterator, utfchar16_t>(cp, result); } } // namespace internal @@ -351,7 +427,7 @@ namespace internal /// The library API - functions intended to be called by the users // Byte order mark - const uint8_t bom[] = {0xef, 0xbb, 0xbf}; + const utfchar8_t bom[] = {0xef, 0xbb, 0xbf}; template <typename octet_iterator> octet_iterator find_invalid(octet_iterator start, octet_iterator end) @@ -365,12 +441,36 @@ namespace internal return result; } + inline const char* find_invalid(const char* str) + { + const char* end = str + std::strlen(str); + return find_invalid(str, end); + } + + inline std::size_t find_invalid(const std::string& s) + { + std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? 
std::string::npos : static_cast<std::size_t>(invalid - s.begin()); + } + template <typename octet_iterator> inline bool is_valid(octet_iterator start, octet_iterator end) { return (utf8::find_invalid(start, end) == end); } + inline bool is_valid(const char* str) + { + return (*(utf8::find_invalid(str)) == '\0'); + } + + inline bool is_valid(const std::string& s) + { + return is_valid(s.begin(), s.end()); + } + + + template <typename octet_iterator> inline bool starts_with_bom (octet_iterator it, octet_iterator end) { @@ -379,7 +479,12 @@ namespace internal ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) ); - } + } + + inline bool starts_with_bom(const std::string& s) + { + return starts_with_bom(s.begin(), s.end()); + } } // namespace utf8 #endif // header guard diff --git a/lib/utfcpp/v3/source/utf8/cpp11.h b/lib/utfcpp/v4/source/utf8/cpp11.h index 2366f129..691633c8 100644 --- a/lib/utfcpp/v3/source/utf8/cpp11.h +++ b/lib/utfcpp/v4/source/utf8/cpp11.h @@ -29,14 +29,12 @@ DEALINGS IN THE SOFTWARE. #define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 #include "checked.h" -#include <string> namespace utf8 { - - inline void append(char32_t cp, std::string& s) + inline void append16(utfchar32_t cp, std::u16string& s) { - append(uint32_t(cp), std::back_inserter(s)); + append16(cp, std::back_inserter(s)); } inline std::string utf16to8(const std::u16string& s) @@ -66,37 +64,6 @@ namespace utf8 utf8to32(s.begin(), s.end(), std::back_inserter(result)); return result; } - - inline std::size_t find_invalid(const std::string& s) - { - std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? 
std::string::npos : static_cast<std::size_t>(invalid - s.begin()); - } - - inline bool is_valid(const std::string& s) - { - return is_valid(s.begin(), s.end()); - } - - inline std::string replace_invalid(const std::string& s, char32_t replacement) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } - - inline std::string replace_invalid(const std::string& s) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline bool starts_with_bom(const std::string& s) - { - return starts_with_bom(s.begin(), s.end()); - } - } // namespace utf8 #endif // header guard diff --git a/lib/utfcpp/v3/source/utf8/cpp17.h b/lib/utfcpp/v4/source/utf8/cpp17.h index 32a77ce3..6e2fcc23 100644 --- a/lib/utfcpp/v3/source/utf8/cpp17.h +++ b/lib/utfcpp/v4/source/utf8/cpp17.h @@ -28,17 +28,10 @@ DEALINGS IN THE SOFTWARE. #ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 #define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 -#include "checked.h" -#include <string> +#include "cpp11.h" namespace utf8 { - - inline void append(char32_t cp, std::string& s) - { - append(uint32_t(cp), std::back_inserter(s)); - } - inline std::string utf16to8(std::u16string_view s) { std::string result; diff --git a/lib/utfcpp/v4/source/utf8/cpp20.h b/lib/utfcpp/v4/source/utf8/cpp20.h new file mode 100644 index 00000000..07b61d0f --- /dev/null +++ b/lib/utfcpp/v4/source/utf8/cpp20.h @@ -0,0 +1,124 @@ +// Copyright 2022 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright 
notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "cpp17.h" + +namespace utf8 +{ + inline std::u8string utf16tou8(const std::u16string& s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf16tou8(std::u16string_view s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string_view& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string_view& s) + { + std::u8string result; + 
utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string_view& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(const std::u8string& s) + { + std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin()); + } + + inline bool is_valid(const std::u8string& s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::u8string replace_invalid(const std::u8string& s) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(const std::u8string& s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/lib/utfcpp/v3/source/utf8/unchecked.h b/lib/utfcpp/v4/source/utf8/unchecked.h index 8fe83c9e..65d4948f 100644 --- a/lib/utfcpp/v3/source/utf8/unchecked.h +++ b/lib/utfcpp/v4/source/utf8/unchecked.h @@ -35,13 +35,19 @@ namespace utf8 namespace unchecked { template <typename octet_iterator> - octet_iterator append(uint32_t cp, octet_iterator result) + octet_iterator append(utfchar32_t cp, octet_iterator result) { return internal::append(cp, result); } + template <typename word_iterator> + word_iterator append16(utfchar32_t cp, word_iterator result) + { + return internal::append16(cp, result); + } + template <typename octet_iterator, typename 
output_iterator> - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) { while (start != end) { octet_iterator sequence_start = start; @@ -52,17 +58,17 @@ namespace utf8 *out++ = *it; break; case internal::NOT_ENOUGH_ROOM: - out = utf8::unchecked::append (replacement, out); + out = utf8::unchecked::append(replacement, out); start = end; break; case internal::INVALID_LEAD: - out = utf8::unchecked::append (replacement, out); + out = utf8::unchecked::append(replacement, out); ++start; break; case internal::INCOMPLETE_SEQUENCE: case internal::OVERLONG_SEQUENCE: case internal::INVALID_CODE_POINT: - out = utf8::unchecked::append (replacement, out); + out = utf8::unchecked::append(replacement, out); ++start; // just one replacement mark for the sequence while (start != end && utf8::internal::is_trail(*start)) @@ -76,16 +82,29 @@ namespace utf8 template <typename octet_iterator, typename output_iterator> inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + static const utfchar32_t replacement_marker = utf8::internal::mask16(0xfffd); return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); } + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + template <typename octet_iterator> - uint32_t next(octet_iterator& it) + utfchar32_t next(octet_iterator& it) { - uint32_t cp = utf8::internal::mask8(*it); - typename 
std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it); - switch (length) { + utfchar32_t cp = utf8::internal::mask8(*it); + switch (utf8::internal::sequence_length(it)) { case 1: break; case 2: @@ -112,13 +131,22 @@ namespace utf8 } template <typename octet_iterator> - uint32_t peek_next(octet_iterator it) + utfchar32_t peek_next(octet_iterator it) { return utf8::unchecked::next(it); } + template <typename word_iterator> + utfchar32_t next16(word_iterator& it) + { + utfchar32_t cp = utf8::internal::mask16(*it++); + if (utf8::internal::is_lead_surrogate(cp)) + return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET; + return cp; + } + template <typename octet_iterator> - uint32_t prior(octet_iterator& it) + utfchar32_t prior(octet_iterator& it) { while (utf8::internal::is_trail(*(--it))) ; octet_iterator temp = it; @@ -126,7 +154,7 @@ namespace utf8 } template <typename octet_iterator, typename distance_type> - void advance (octet_iterator& it, distance_type n) + void advance(octet_iterator& it, distance_type n) { const distance_type zero(0); if (n < zero) { @@ -142,7 +170,7 @@ namespace utf8 template <typename octet_iterator> typename std::iterator_traits<octet_iterator>::difference_type - distance (octet_iterator first, octet_iterator last) + distance(octet_iterator first, octet_iterator last) { typename std::iterator_traits<octet_iterator>::difference_type dist; for (dist = 0; first < last; ++dist) @@ -151,13 +179,15 @@ namespace utf8 } template <typename u16bit_iterator, typename octet_iterator> - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result) { while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first + utfchar32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first if 
(utf8::internal::is_lead_surrogate(cp)) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); + if (start == end) + return result; + utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; } result = utf8::unchecked::append(cp, result); @@ -166,22 +196,22 @@ namespace utf8 } template <typename u16bit_iterator, typename octet_iterator> - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result) { while (start < end) { - uint32_t cp = utf8::unchecked::next(start); + utfchar32_t cp = utf8::unchecked::next(start); if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + *result++ = static_cast<utfchar16_t>((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); } else - *result++ = static_cast<uint16_t>(cp); + *result++ = static_cast<utfchar16_t>(cp); } return result; } template <typename octet_iterator, typename u32bit_iterator> - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result) { while (start != end) result = utf8::unchecked::append(*(start++), result); @@ -190,7 +220,7 @@ namespace utf8 } template <typename octet_iterator, typename u32bit_iterator> - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result) { while (start < end) (*result++) = utf8::unchecked::next(start); @@ -203,16 +233,16 @@ namespace utf8 class iterator { octet_iterator it; public: - typedef uint32_t value_type; - typedef 
uint32_t* pointer; - typedef uint32_t& reference; + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; typedef std::ptrdiff_t difference_type; typedef std::bidirectional_iterator_tag iterator_category; iterator () {} explicit iterator (const octet_iterator& octet_it): it(octet_it) {} // the default "big three" are OK octet_iterator base () const { return it; } - uint32_t operator * () const + utfchar32_t operator * () const { octet_iterator temp = it; return utf8::unchecked::next(temp); diff --git a/lib/utfcpp/v3/tests/CMakeLists.txt b/lib/utfcpp/v4/tests/CMakeLists.txt index f3ce2584..8a00a6a0 100644 --- a/lib/utfcpp/v3/tests/CMakeLists.txt +++ b/lib/utfcpp/v4/tests/CMakeLists.txt @@ -1,15 +1,20 @@ -add_executable(negative ${PROJECT_SOURCE_DIR}/tests/negative.cpp) -add_executable(cpp11 ${PROJECT_SOURCE_DIR}/tests/test_cpp11.cpp) -add_executable(cpp17 ${PROJECT_SOURCE_DIR}/tests/test_cpp17.cpp) -add_executable(apitests ${PROJECT_SOURCE_DIR}/tests/apitests.cpp) +cmake_minimum_required (VERSION 3.5) +project(utfcpptests LANGUAGES CXX) +enable_testing() + +add_library(${PROJECT_NAME} INTERFACE) + +include_directories("${PROJECT_SOURCE_DIR}/../source") + +add_executable(negative negative.cpp) +add_executable(cpp11 test_cpp11.cpp) +add_executable(cpp17 test_cpp17.cpp) +add_executable(cpp20 test_cpp20.cpp) +add_executable(apitests apitests.cpp) + +add_executable(noexceptionstests noexceptionstests.cpp) -add_executable(noexceptionstests ${PROJECT_SOURCE_DIR}/tests/noexceptionstests.cpp) -target_link_libraries(negative PRIVATE utf8::cpp) -target_link_libraries(cpp11 PRIVATE utf8::cpp) -target_link_libraries(cpp17 PRIVATE utf8::cpp) -target_link_libraries(apitests PRIVATE utf8::cpp) -target_link_libraries(noexceptionstests PRIVATE utf8::cpp) target_compile_options(${PROJECT_NAME} INTERFACE $<$<CXX_COMPILER_ID:MSVC>:/W4> @@ -35,9 +40,17 @@ set_target_properties(cpp17 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO) 
-add_test(negative_test negative ${PROJECT_SOURCE_DIR}/tests/test_data/utf8_invalid.txt) +set_target_properties(cpp20 + PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED YES + CXX_EXTENSIONS NO) + + +add_test(negative_test negative ${PROJECT_SOURCE_DIR}/test_data/utf8_invalid.txt) add_test(cpp11_test cpp11) add_test(cpp17_test cpp17) +add_test(cpp20_test cpp20) add_test(api_test apitests) add_test(noexceptions_test noexceptionstests) diff --git a/lib/utfcpp/v3/tests/apitests.cpp b/lib/utfcpp/v4/tests/apitests.cpp index 083266d7..083266d7 100644 --- a/lib/utfcpp/v3/tests/apitests.cpp +++ b/lib/utfcpp/v4/tests/apitests.cpp diff --git a/lib/utfcpp/v3/tests/docker/Dockerfile b/lib/utfcpp/v4/tests/docker/Dockerfile index 9df3717a..dcdd47d1 100644 --- a/lib/utfcpp/v3/tests/docker/Dockerfile +++ b/lib/utfcpp/v4/tests/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM debian:buster-slim +FROM gcc:12.2 RUN apt-get update \ && apt-get install -y make g++ cmake git \ diff --git a/lib/utfcpp/v3/tests/negative.cpp b/lib/utfcpp/v4/tests/negative.cpp index f1bcc993..f1bcc993 100644 --- a/lib/utfcpp/v3/tests/negative.cpp +++ b/lib/utfcpp/v4/tests/negative.cpp diff --git a/lib/utfcpp/v3/tests/noexceptionstests.cpp b/lib/utfcpp/v4/tests/noexceptionstests.cpp index 108ee750..108ee750 100644 --- a/lib/utfcpp/v3/tests/noexceptionstests.cpp +++ b/lib/utfcpp/v4/tests/noexceptionstests.cpp diff --git a/lib/utfcpp/v3/tests/test_checked_api.h b/lib/utfcpp/v4/tests/test_checked_api.h index 3a7067b7..54e9cf8f 100644 --- a/lib/utfcpp/v3/tests/test_checked_api.h +++ b/lib/utfcpp/v4/tests/test_checked_api.h @@ -47,6 +47,22 @@ TEST(CheckedAPITests, test_append) EXPECT_EQ (c[1], 0); } +TEST(CheckedAPITests, test_append16) +{ + utfchar16_t u[5] = {0,0}; + append16(0x0448, u); + EXPECT_EQ (u[0], 0x0448); + EXPECT_EQ (u[1], 0x0000); + + append16(0x65e5, u); + EXPECT_EQ (u[0], 0x65e5); + EXPECT_EQ (u[1], 0x0000); + + append16(0x10346, u); + EXPECT_EQ (u[0], 0xd800); + EXPECT_EQ (u[1], 0xdf46); +} + 
TEST(CheckedAPITests, test_next) { const char* twochars = "\xe6\x97\xa5\xd1\x88"; @@ -71,6 +87,19 @@ TEST(CheckedAPITests, test_next) EXPECT_EQ (w, threechars + 9); } +TEST(CheckedAPITests, test_next16) +{ + const utfchar16_t u[3] = {0x65e5, 0xd800, 0xdf46}; + const utfchar16_t* w = u; + utf8::utfchar32_t cp = next16(w, w + 3); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, u + 1); + + cp = next16(w, w + 2); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, u + 3); +} + TEST(CheckedAPITests, test_peek_next) { const char* const cw = "\xe6\x97\xa5\xd1\x88"; @@ -171,7 +200,9 @@ TEST(CheckedAPITests, test_replace_invalid) TEST(CheckedAPITests, test_find_invalid) { char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - char* invalid = find_invalid(utf_invalid, utf_invalid + 6); + const char* invalid = find_invalid(utf_invalid, utf_invalid + 6); + EXPECT_EQ (invalid, utf_invalid + 5); + invalid = find_invalid(utf_invalid); EXPECT_EQ (invalid, utf_invalid + 5); } @@ -180,9 +211,13 @@ TEST(CheckedAPITests, test_is_valid) char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; bool bvalid = is_valid(utf_invalid, utf_invalid + 6); EXPECT_FALSE (bvalid); + bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); EXPECT_TRUE (bvalid); + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); } TEST(CheckedAPITests, test_starts_with_bom) diff --git a/lib/utfcpp/v3/tests/test_checked_iterator.h b/lib/utfcpp/v4/tests/test_checked_iterator.h index 2829a734..2829a734 100644 --- a/lib/utfcpp/v3/tests/test_checked_iterator.h +++ b/lib/utfcpp/v4/tests/test_checked_iterator.h diff --git a/lib/utfcpp/v3/tests/test_cpp11.cpp b/lib/utfcpp/v4/tests/test_cpp11.cpp index ee4ddd8f..e5219668 100644 --- a/lib/utfcpp/v3/tests/test_cpp11.cpp +++ b/lib/utfcpp/v4/tests/test_cpp11.cpp @@ -37,11 +37,24 @@ TEST(CPP11APITests, test_append) EXPECT_EQ (u.length(), 4); } 
+TEST(CPP11APITests, test_append16) +{ + u16string u; + append16(0x0448, u); + EXPECT_EQ (u[0], char16_t(0x0448)); + EXPECT_EQ (u.length(), 1); +} + TEST(CPP11APITests, test_utf16to8) { u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; string u = utf16to8(utf16string); EXPECT_EQ (u.size(), 10); + + u16string h16 = u"h!"; + string h8; + utf8::unchecked::utf16to8(h16.begin(), h16.end(), std::back_inserter(h8)); + EXPECT_EQ (h8, "h!"); } TEST(CPP11APITests, test_utf8to16) diff --git a/lib/utfcpp/v3/tests/test_cpp17.cpp b/lib/utfcpp/v4/tests/test_cpp17.cpp index 4b87816b..a38e6f71 100644 --- a/lib/utfcpp/v3/tests/test_cpp17.cpp +++ b/lib/utfcpp/v4/tests/test_cpp17.cpp @@ -10,8 +10,8 @@ using namespace std; TEST(CPP17APITests, test_utf16to8) { u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - u16string_view utf16stringview(u16string); - string u = utf16to8(utf16string); + u16string_view utf16stringview(utf16string); + string u = utf16to8(utf16stringview); EXPECT_EQ (u.size(), 10); } diff --git a/lib/utfcpp/v4/tests/test_cpp20.cpp b/lib/utfcpp/v4/tests/test_cpp20.cpp new file mode 100644 index 00000000..50dbe30a --- /dev/null +++ b/lib/utfcpp/v4/tests/test_cpp20.cpp @@ -0,0 +1,77 @@ +#include "../extern/ftest/ftest.h" +#define UTF_CPP_CPLUSPLUS 202002L +#include "utf8.h" +#include <string> +using namespace utf8; +using namespace std; + +TEST(CPP20APITests, test_utf16tou8) +{ + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u16string_view utf16stringview{utf16string}; + u8string u = utf16tou8(utf16string); + EXPECT_EQ (u.size(), 10); + u = utf16tou8(utf16stringview); + EXPECT_EQ (u.size(), 10); +} + +TEST(CPP20APITests, tes20t_utf8to16) +{ + u8string utf8_with_surrogates{u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"}; + u16string utf16result = utf8to16(utf8_with_surrogates); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CPP20APITests, test_utf32tou8) 
+{ + u32string utf32string = {0x448, 0x65E5, 0x10346}; + u32string_view utf32stringview{utf32string}; + u8string utf8result = utf32tou8(utf32stringview); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CPP20APITests, test_utf8to32) +{ + u8string twochars = u8"\xe6\x97\xa5\xd1\x88"; + u32string utf32result = utf8to32(twochars); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CPP20APITests, test_find_invalid) +{ + u8string utf_invalid = u8"\xe6\x97\xa5\xd1\x88\xfa"; + auto invalid = find_invalid(utf_invalid); + EXPECT_EQ (invalid, 5); +} + +TEST(CPP20APITests, test_is_valid) +{ + u8string utf_invalid = u8"\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CPP20APITests, test_replace_invalid) +{ + u8string invalid_sequence = u8"a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?'); + bool bvalid = is_valid(replace_invalid_result); + EXPECT_TRUE (bvalid); + const u8string fixed_invalid_sequence = u8"a????z"; + EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); +} + +TEST(CPP20APITests, test_starts_with_bom) +{ + u8string byte_order_mark = u8"\xef\xbb\xbf"; + bool bbom = starts_with_bom(byte_order_mark); + EXPECT_TRUE (bbom); + u8string threechars = u8"\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars); + EXPECT_FALSE (no_bbom); +} diff --git a/lib/utfcpp/v3/tests/test_data/utf8_invalid.txt b/lib/utfcpp/v4/tests/test_data/utf8_invalid.txt Binary files differindex ae831593..ae831593 100644 --- a/lib/utfcpp/v3/tests/test_data/utf8_invalid.txt +++ b/lib/utfcpp/v4/tests/test_data/utf8_invalid.txt diff --git a/lib/utfcpp/v3/tests/test_unchecked_api.h b/lib/utfcpp/v4/tests/test_unchecked_api.h index 10c5991e..aa0cf697 100644 --- a/lib/utfcpp/v3/tests/test_unchecked_api.h +++ 
b/lib/utfcpp/v4/tests/test_unchecked_api.h @@ -40,6 +40,22 @@ TEST(UnCheckedAPITests, test_append) EXPECT_EQ (u[4], 0); } +TEST(UnCheckedAPITests, test_append16) +{ + unsigned short u[5] = {0,0}; + utf8::unchecked::append16(0x0448, u); + EXPECT_EQ (u[0], 0x0448); + EXPECT_EQ (u[1], 0x0000); + + utf8::unchecked::append16(0x65e5, u); + EXPECT_EQ (u[0], 0x65e5); + EXPECT_EQ (u[1], 0x0000); + + utf8::unchecked::append16(0x10346, u); + EXPECT_EQ (u[0], 0xd800); + EXPECT_EQ (u[1], 0xdf46); +} + TEST(UnCheckedAPITests, test_next) { const char* twochars = "\xe6\x97\xa5\xd1\x88"; @@ -64,6 +80,19 @@ TEST(UnCheckedAPITests, test_next) EXPECT_EQ (w, threechars + 9); } +TEST(UnCheckedAPITests, test_next16) +{ + const utf8::utfchar16_t u[3] = {0x65e5, 0xd800, 0xdf46}; + const utf8::utfchar16_t* w = u; + utf8::utfchar32_t cp = utf8::unchecked::next16(w); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, u + 1); + + cp = utf8::unchecked::next16(w); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, u + 3); +} + TEST(UnCheckedAPITests, test_peek_next) { const char* const cw = "\xe6\x97\xa5\xd1\x88"; @@ -137,6 +166,11 @@ TEST(UnCheckedAPITests, test_utf16to8) string utf8result; utf8::unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); EXPECT_EQ (utf8result.size(), 10); + + utf8result.clear(); + unsigned short highsurrogateonly[] = {0xd800}; + utf8::unchecked::utf16to8(highsurrogateonly, highsurrogateonly + 1, back_inserter(utf8result)); + EXPECT_TRUE(true); // we didn't crash } TEST(UnCheckedAPITests, test_utf8to16) diff --git a/lib/utfcpp/v3/tests/test_unchecked_iterator.h b/lib/utfcpp/v4/tests/test_unchecked_iterator.h index 4294232d..4294232d 100644 --- a/lib/utfcpp/v3/tests/test_unchecked_iterator.h +++ b/lib/utfcpp/v4/tests/test_unchecked_iterator.h diff --git a/lib/utfcpp/v4/utf8cppConfig.cmake.in b/lib/utfcpp/v4/utf8cppConfig.cmake.in new file mode 100644 index 00000000..4bdb9c41 --- /dev/null +++ b/lib/utfcpp/v4/utf8cppConfig.cmake.in @@ -0,0 +1,8 @@ 
+@PACKAGE_INIT@ + +include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") +check_required_components("@PROJECT_NAME@") + +if(NOT TARGET utf8::cpp) + add_library(utf8::cpp ALIAS utf8cpp::utf8cpp) +endif() |