微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

将 utf-8 小写转换为大写

如何解决将 utf-8 小写转换为大写

我有一些 UTF-8 格式的字符串,需要把它们转换为大写(或者反过来转换为小写)。对于标准 ASCII 字符,这很容易,因为 C++ 为此提供了函数;但对于非 ASCII 字符(如西里尔文、希腊文等),这就成了难题。我找到了 ICU 库(参见 https://unicode-org.github.io ,尤其是 https://unicode-org.github.io/icu/userguide/ 和 https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/ )和一个示例(https://www.delftstack.com/howto/cpp/how-to-convert-string-to-uppercase-cpp/ )。

由此我构建了一个示例:

#include <iostream>
#include <string>
#include <algorithm>
#include <unicode/unistr.h>
#include <unicode/ustream.h>
#include <unicode/Locid.h>

using std::cout; using std::string;
using std::endl; using std::cin;
using std::transform;
using std::toupper;

// Sample from the question: wraps several UTF-8 encoded std::string values in
// icu::UnicodeString objects, upper-cases them and prints each input next to
// its converted form.
//
// NOTE(review): the UnicodeString(const char*) constructor is believed to decode
// its argument with ICU's default codepage rather than as UTF-8 - presumably the
// reason the non-ASCII characters are missing from the output quoted after this
// listing. icu::UnicodeString::fromUTF8() is the UTF-8-explicit alternative;
// confirm against the ICU API reference.
// NOTE(review): ICU spells its case-mapping member toUpper(); the lowercase
// `toupper` used below looks like a transcription artifact - confirm.
int main() {
    // Greek and Cyrillic test inputs in different starting cases.
    string string0("hello there είναι απλά ένα κείμενο χωρίς");
    string string1("hallo Привет");
    string string2("Hallo Привет");
    string string3("HALLO ПРИВЕТ");

    icu::UnicodeString unicodeString0(string0.c_str());
    cout << "input string:  " << string0 << endl
        <<  "output string: " << unicodeString0.toupper() << endl;

    icu::UnicodeString unicodeString1(string1.c_str());
    cout << "input string:  " << string1 << endl
        <<  "output string: " << unicodeString1.toupper() << endl;

    icu::UnicodeString unicodeString2(string2.c_str());
    cout << "input string:  " << string2 << endl
        <<  "output string: " << unicodeString2.toupper() << endl;

    icu::UnicodeString unicodeString3(string3.c_str());
    cout << "input string:  " << string3 << endl
        <<  "output string: " << unicodeString3.toupper() << endl;

    // French input with accented Latin letters.
    string string4 = "Contrairement à une opinion répandue";

    icu::UnicodeString unicodeString4(string4.c_str());
    // Converted twice: once without an argument and once with the "fr-FR" locale name.
    cout << "input string:  " << string4 << endl
        << "output string: " << unicodeString4.toupper() << endl
        << "output string: " << unicodeString4.toupper("fr-FR") << endl;
    return 0;
}

编译它(gcc9.3.0 ICU library 67.1):

g++ s2.cpp -licuio -licuuc -o s2

运行时我得到:

input string:  hello there είναι απλά ένα κείμενο χωρίς
output string: HELLO THERE
input string:  hallo Привет
output string: HALLO
input string:  Hallo Привет
output string: HALLO
input string:  HALLO ПРИВЕТ
output string: HALLO
input string:  Contrairement à une opinion répandue
output string: CONTRAIREMENT  UNE OPINION RPANDUE
output string: CONTRAIREMENT  UNE OPINION RPANDUE

所以我们看到,所有的特殊字符都没有显示在输出中。我一定是漏掉了什么(也许是很简单的东西?),但我自己没有发现。我并不局限于 ICU 库,因此也欢迎使用 C++ 中其他能在小写和大写之间转换的解决方案。

有什么建议吗?

解决方法

检查/测试我的代码 https://repl.it/@JomaCorpFX/ToUpperToLower#main.cpp

若要在 Windows 上正确绘制字形/Unicode 字符(例如“?”),我建议使用新的 Windows Terminal 运行您的程序。

阅读有关 Windows 控制台的内容
Windows Command-Line: Unicode and UTF-8 Output Text Buffer

更新:添加了 CYGWIN 的代码和一些修复。

代码


#include <clocale>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <locale>
#include <set>
#include <stdexcept>
#include <string>

// Platform detection. At most one *_PLATFORM macro is defined (to 1) for the
// recognised target, together with the DLL* decoration macros for that
// toolchain. The rest of the file selects its implementation with
// `#if WINDOWS_PLATFORM` / `#elif LINUX_PLATFORM || ...` chains.

// WINDOWS
#if (_WIN32)
// NOMINMAX only has an effect when it is defined before <Windows.h> is
// included; afterwards the header has already declared its min/max macros.
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <Windows.h>
#include <conio.h>
#define WINDOWS_PLATFORM 1
#define DLLCALL STDCALL
#define DLLIMPORT _declspec(dllimport)
#define DLLEXPORT _declspec(dllexport)
#define DLLPRIVATE

// CYGWIN
#elif __CYGWIN__
#define CYGWIN_PLATFORM 1
#include <windows.h>
#include <unistd.h>
#include <termios.h>
#define DLLCALL CDECL
#define DLLIMPORT
#define DLLEXPORT __attribute__((visibility("default")))
#define DLLPRIVATE __attribute__((visibility("hidden")))
#define CoTaskMemAlloc(p) malloc(p)
#define CoTaskMemFree(p) free(p)

//EMSCRIPTEN
#elif defined(__EMSCRIPTEN__)
#include <emscripten/emscripten.h>
#include <emscripten/bind.h>
#include <unistd.h>
#include <termios.h>
#define EMSCRIPTEN_PLATFORM 1
#define DLLCALL
#define DLLIMPORT
#define DLLEXPORT __attribute__((visibility("default")))
#define DLLPRIVATE __attribute__((visibility("hidden")))

// LINUX - Ubuntu,Fedora,Centos,Debian,RedHat
#elif (__LINUX__ || __gnu_linux__ || __linux__ || __linux || linux)
#define LINUX_PLATFORM 1
#include <unistd.h>
#include <termios.h>
#define DLLCALL CDECL
#define DLLIMPORT
#define DLLEXPORT __attribute__((visibility("default")))
#define DLLPRIVATE __attribute__((visibility("hidden")))
#define CoTaskMemAlloc(p) malloc(p)
#define CoTaskMemFree(p) free(p)

//ANDROID
// NOTE(review): Android toolchains usually also define __linux__, in which case
// the Linux branch above wins and this branch is never reached - confirm before
// relying on ANDROID_PLATFORM.
#elif (__ANDROID__ || ANDROID)
#define ANDROID_PLATFORM 1
#define DLLCALL
#define DLLIMPORT
#define DLLEXPORT __attribute__((visibility("default")))
#define DLLPRIVATE __attribute__((visibility("hidden")))

//MACOS
#elif defined(__APPLE__)
#include <unistd.h>
#include <termios.h>
#define DLLCALL
#define DLLIMPORT
#define DLLEXPORT __attribute__((visibility("default")))
#define DLLPRIVATE __attribute__((visibility("hidden")))
#include "TargetConditionals.h"
// Distinguish the Apple targets; only MACOS_PLATFORM is handled further below.
#if TARGET_OS_IPHONE && TARGET_IPHONE_SIMULATOR
#define IOS_SIMULATOR_PLATFORM 1
#elif TARGET_OS_IPHONE
#define IOS_PLATFORM 1
#elif TARGET_OS_MAC
#define MACOS_PLATFORM 1
#else

#endif

#endif

// Enables the ""s literal suffix used by the EMPTY_* macros and by main().
using namespace std::literals::string_literals;

// File-wide aliases for the narrow and the wide string type.
using String = std::string;
using WString = std::wstring;

// Empty string objects of each kind, spelled as literals.
#define EMPTY_STRING u8""s
#define EMPTY_WSTRING L""s

class Strings
{
public:
    static String WideStringToString(const WString &wstr)
    {
        if (wstr.empty())
        {
            return String();
        }
        size_t pos;
        size_t begin = 0;
        String ret;
#if WINDOWS_PLATFORM 

        int size;
        pos = wstr.find(static_cast<wchar_t>(0),begin);
        while (pos != WString::npos && begin < wstr.length())
        {
            WString segment = WString(&wstr[begin],pos - begin);
            size = WideCharToMultiByte(CP_UTF8,WC_ERR_INVALID_CHARS,&segment[0],segment.size(),NULL,NULL);

            String converted = String(size,0);
            WideCharToMultiByte(CP_UTF8,&converted[0],converted.size(),NULL);
            ret.append(converted);
            ret.append({0});
            begin = pos + 1;
            pos = wstr.find(static_cast<wchar_t>(0),begin);
        }
        if (begin <= wstr.length())
        {
            WString segment = WString(&wstr[begin],wstr.length() - begin);
            size = WideCharToMultiByte(CP_UTF8,NULL);
            String converted = String(size,NULL);
            ret.append(converted);
        }
#elif LINUX_PLATFORM || MACOS_PLATFORM || EMSCRIPTEN_PLATFORM || CYGWIN_PLATFORM
        size_t size;
        pos = wstr.find(static_cast<wchar_t>(0),pos - begin);
            size = wcstombs(nullptr,segment.c_str(),0);
            String converted = String(size,0);
            wcstombs(&converted[0],converted.size());
            ret.append(converted);
            ret.append({0});
            begin = pos + 1;
            pos = wstr.find(static_cast<wchar_t>(0),wstr.length() - begin);
            size = wcstombs(nullptr,converted.size());
            ret.append(converted);
        }
#else
        static_assert(false,"Unknown Platform");
#endif
        return ret;
    }

    static WString StringToWideString(const String &str)
    {
        if (str.empty())
        {
            return WString();
        }

        size_t pos;
        size_t begin = 0;
        WString ret;
#if WINDOWS_PLATFORM
        int size = 0;
        pos = str.find(static_cast<char>(0),begin);
        while (pos != std::string::npos)
        {
            std::string segment = std::string(&str[begin],pos - begin);
            std::wstring converted = std::wstring(segment.size() + 1,0);
            size = MultiByteToWideChar(CP_UTF8,MB_ERR_INVALID_CHARS,converted.length());
            converted.resize(size);
            ret.append(converted);
            ret.append({0});
            begin = pos + 1;
            pos = str.find(static_cast<char>(0),begin);
        }
        if (begin < str.length())
        {
            std::string segment = std::string(&str[begin],str.length() - begin);
            std::wstring converted = std::wstring(segment.size() + 1,converted.length());
            converted.resize(size);
            ret.append(converted);
        }

#elif LINUX_PLATFORM || MACOS_PLATFORM || EMSCRIPTEN_PLATFORM || CYGWIN_PLATFORM
        size_t size;
        pos = str.find(static_cast<char>(0),begin);
        while (pos != String::npos)
        {
            String segment = String(&str[begin],pos - begin);
            WString converted = WString(segment.size(),0);
            size = mbstowcs(&converted[0],converted.size());
            converted.resize(size);
            ret.append(converted);
            ret.append({0});
            begin = pos + 1;
            pos = str.find(static_cast<char>(0),begin);
        }
        if (begin < str.length())
        {
            String segment = String(&str[begin],str.length() - begin);
            WString converted = WString(segment.size(),converted.size());
            converted.resize(size);
            ret.append(converted);
        }
#else
        static_assert(false,"Unknown Platform");
#endif
        return ret;
    }

    static WString ToUpper(const WString &data)
    {
        WString result = data;
        auto &f = std::use_facet<std::ctype<wchar_t>>(std::locale());

        f.toupper(&result[0],&result[0] + result.size());
        return result;
    }

    static String ToUpper(const String &data)
    {
        return WideStringToString(ToUpper(StringToWideString(data)));
    }

    static WString ToLower(const WString &data)
    {
        WString result = data;
        auto &f = std::use_facet<std::ctype<wchar_t>>(std::locale());
        f.tolower(&result[0],&result[0] + result.size());
        return result;
    }

    static String ToLower(const String &data)
    {
        return WideStringToString(ToLower(StringToWideString(data)));
    }
};

/// Text attributes for console output. Console::SetVirtualTerminalFormat writes
/// each enumerator's integer value as a parameter of the "\033[...m" sequence.
enum class ConsoleTextStyle
{
    DEFAULT = 0,
    BOLD = 1,
    FAINT = 2,
    ITALIC = 3,
    UNDERLINE = 4,
    SLOW_BLINK = 5,
    RAPID_BLINK = 6,
    REVERSE = 7
};

/// Foreground (text) colours. The integer value of the chosen enumerator is
/// written as the first parameter of the "\033[...m" formatting sequence.
enum class ConsoleForeground
{
    DEFAULT = 39,

    // 30-37: the darker set
    BLACK = 30,
    DARK_RED = 31,
    DARK_GREEN = 32,
    DARK_YELLOW = 33,
    DARK_BLUE = 34,
    DARK_MAGENTA = 35,
    DARK_CYAN = 36,
    GRAY = 37,

    // 90-97: the brighter set
    DARK_GRAY = 90,
    RED = 91,
    GREEN = 92,
    YELLOW = 93,
    BLUE = 94,
    MAGENTA = 95,
    CYAN = 96,
    WHITE = 97
};

/// Background colours. The integer value of the chosen enumerator is written
/// as the second parameter of the "\033[...m" formatting sequence.
enum class ConsoleBackground
{
    DEFAULT = 49,

    // 40-47: the darker set
    BLACK = 40,
    DARK_RED = 41,
    DARK_GREEN = 42,
    DARK_YELLOW = 43,
    DARK_BLUE = 44,
    DARK_MAGENTA = 45,
    DARK_CYAN = 46,
    GRAY = 47,

    // 100-107: the brighter set
    DARK_GRAY = 100,
    RED = 101,
    GREEN = 102,
    YELLOW = 103,
    BLUE = 104,
    MAGENTA = 105,
    CYAN = 106,
    WHITE = 107
};

class Console
{
private:
    static void EnableVirtualTermimalProcessing()
    {
#if WINDOWS_PLATFORM
        HANDLE hOut = GetStdHandle(STD_OUTPUT_HANDLE);
        DWORD dwMode = 0;
        GetConsoleMode(hOut,&dwMode);
        if (!(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        {
            dwMode |= ENABLE_VIRTUAL_TERMINAL_PROCESSING;
            SetConsoleMode(hOut,dwMode);
        }
#endif
    }

    static void ResetTerminalFormat()
    {
        std::cout << u8"\033[0m";
    }

    static void SetVirtualTerminalFormat(ConsoleForeground foreground,ConsoleBackground background,std::set<ConsoleTextStyle> styles)
    {
        String format = u8"\033[";
        format.append(std::to_string(static_cast<int>(foreground)));
        format.append(u8";");
        format.append(std::to_string(static_cast<int>(background)));
        if (styles.size() > 0)
        {
            for (auto it = styles.begin(); it != styles.end(); ++it)
            {
                format.append(u8";");
                format.append(std::to_string(static_cast<int>(*it)));
            }
        }
        format.append(u8"m");
        std::cout << format;
    }

public:
    static void Clear()
    {

#if WINDOWS_PLATFORM 
        std::system(u8"cls");
#elif LINUX_PLATFORM || defined MACOS_PLATFORM || CYGWIN_PLATFORM
        std::system(u8"clear");
#elif EMSCRIPTEN_PLATFORM
        emscripten::val::global()["console"].call<void>(u8"clear");
#else
        static_assert(false,"Unknown Platform");
#endif
    }

    static void Write(const String &s,ConsoleForeground foreground = ConsoleForeground::DEFAULT,ConsoleBackground background = ConsoleBackground::DEFAULT,std::set<ConsoleTextStyle> styles = {})
    {
#ifndef EMSCRIPTEN_PLATFORM
        EnableVirtualTermimalProcessing();
        SetVirtualTerminalFormat(foreground,background,styles);
#endif
        String str = s;
#if WINDOWS_PLATFORM
        WString unicode = Strings::StringToWideString(str);
        WriteConsole(GetStdHandle(STD_OUTPUT_HANDLE),unicode.c_str(),static_cast<DWORD>(unicode.length()),nullptr,nullptr);
#elif defined LINUX_PLATFORM || defined MACOS_PLATFORM || EMSCRIPTEN_PLATFORM || CYGWIN_PLATFORM
        std::cout << str;
#else
        static_assert(false,"Unknown Platform");
#endif

#ifndef EMSCRIPTEN_PLATFORM
        ResetTerminalFormat();
#endif
    }

    static void WriteLine(const String &s,std::set<ConsoleTextStyle> styles = {})
    {
        Write(s,foreground,styles);
        std::cout << std::endl;
    }

    static void Write(const WString &s,styles);
#endif
        WString str = s;

#if WINDOWS_PLATFORM 
        WriteConsole(GetStdHandle(STD_OUTPUT_HANDLE),str.c_str(),static_cast<DWORD>(str.length()),nullptr);
#elif LINUX_PLATFORM || MACOS_PLATFORM || EMSCRIPTEN_PLATFORM || CYGWIN_PLATFORM
        std::cout << Strings::WideStringToString(str);
#else
        static_assert(false,"Unknown Platform");
#endif

#ifndef EMSCRIPTEN_PLATFORM
        ResetTerminalFormat();
#endif
    }

    static void WriteLine(const WString &s,styles);
        std::cout << std::endl;
    }

    static void WriteLine()
    {
        std::cout << std::endl;
    }

    static void Pause()
    {
        char c;
        do
        {
            c = getchar();
            std::cout << "Press Key " << std::endl;
        } while (c != 64);
        std::cout << "KeyPressed" << std::endl;
    }

    static int PauseAny(bool printWhenPressed = false,std::set<ConsoleTextStyle> styles = {})
    {
        int ch;
#ifdef WINDOWS_PLATFORM
        ch = _getch();
#elif LINUX_PLATFORM || MACOS_PLATFORM || EMSCRIPTEN_PLATFORM || CYGWIN_PLATFORM
        struct termios oldt,newt;
        tcgetattr(STDIN_FILENO,&oldt);
        newt = oldt;
        newt.c_lflag &= ~(ICANON | ECHO);
        tcsetattr(STDIN_FILENO,TCSANOW,&newt);
        ch = getchar();
        tcsetattr(STDIN_FILENO,&oldt);
#else
        static_assert(false,"Unknown Platform");
#endif
        if (printWhenPressed)
        {
            Console::Write(String(1,ch),styles);
        }
        return ch;
    }
};

int main()
{
#if CYGWIN_PLATFORM
    /*
    using c++ std::locale::global(). It generates an error.
    
    terminate called after throwing an instance of 'std::runtime_error'
    what():  locale::facet::_S_create_c_locale name not valid

    it need to be fixed.
    */
    std::setlocale(LC_ALL,u8"en_US.UTF8"); //Calling clasic C locale function all OK. Needed for Console::WriteLine on Linux. If not present throws an error and if not is a unicode locale throws an error. It need to be fixed.
#else
    std::locale::global(std::locale(u8"en_US.UTF8")); //Required for Linux. Error when run without set unicode locale. This need to be investigated and fixed.
#endif
    String dataStr = u8"Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë";
    WString dataWStr = L"Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë";
    Console::WriteLine(dataStr);
    Console::WriteLine(dataWStr);
    dataStr = Strings::ToUpper(dataStr);
    dataWStr = Strings::ToUpper(dataWStr);
    Console::WriteLine(dataStr);
    Console::WriteLine(dataWStr);
    dataStr = Strings::ToLower(dataStr);
    dataWStr = Strings::ToLower(dataWStr);
    Console::WriteLine(dataStr);
    Console::WriteLine(dataWStr);

    //Another examples
    WString string0(L"hello there είναι απλά ένα κείμενο χωρίς");
    WString string1(L"hallo Привет");
    WString string2(L"Hallo Привет");
    WString string3(L"HALLO ПРИВЕТ");
    WString string4 = L"Contrairement à une opinion répandue ?";

    Console::WriteLine(u8"█ Original");
    Console::WriteLine(string0);
    Console::WriteLine(string1);
    Console::WriteLine(string2);
    Console::WriteLine(string3);
    Console::WriteLine(string4);

    Console::WriteLine(u8"█ ToUpper");
    string0 = Strings::ToUpper(string0);
    string1 = Strings::ToUpper(string1);
    string2 = Strings::ToUpper(string2);
    string3 = Strings::ToUpper(string3);
    string4 = Strings::ToUpper(string4);
    Console::WriteLine(string0);
    Console::WriteLine(string1);
    Console::WriteLine(string2);
    Console::WriteLine(string3);
    Console::WriteLine(string4);

    Console::WriteLine(u8"█ ToLower");
    string0 = Strings::ToLower(string0);
    string1 = Strings::ToLower(string1);
    string2 = Strings::ToLower(string2);
    string3 = Strings::ToLower(string3);
    string4 = Strings::ToLower(string4);
    Console::WriteLine(string0);
    Console::WriteLine(string1);
    Console::WriteLine(string2);
    Console::WriteLine(string3);
    Console::WriteLine(string4);

    Console::WriteLine(u8"Press any key to exit"s,ConsoleForeground::DARK_GRAY);
    Console::PauseAny();

    return 0;
}

输出

Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë
Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë
ZOË SALDAÑA PLAYED IN LA MALDICIÓN DEL PADRE CARDONA. ËÈÑ ΑΩ ÓÓCHLOË
ZOË SALDAÑA PLAYED IN LA MALDICIÓN DEL PADRE CARDONA. ËÈÑ ΑΩ ÓÓCHLOË
zoë saldaña played in la maldición del padre cardona. ëèñ αω óóchloë
zoë saldaña played in la maldición del padre cardona. ëèñ αω óóchloë
█ Original
hello there είναι απλά ένα κείμενο χωρίς
hallo Привет
Hallo Привет
HALLO ПРИВЕТ
Contrairement à une opinion répandue ?
█ ToUpper
HELLO THERE ΕΊΝΑΙ ΑΠΛΆ ΈΝΑ ΚΕΊΜΕΝΟ ΧΩΡΊΣ
HALLO ПРИВЕТ
HALLO ПРИВЕТ
HALLO ПРИВЕТ
CONTRAIREMENT À UNE OPINION RÉPANDUE ?
█ ToLower
hello there είναι απλά ένα κείμενο χωρίσ
hallo привет
hallo привет
hallo привет
contrairement à une opinion répandue ?
Press any key to exit

使用 Visual C++ 编译器编译 - Powershell 和 CMD 使用 Windows 终端的输出 - 正确绘图✔️

wt


用Visual C++编译器编译-纯cmd的输出-局部绘图❗

cmd


用Visual C++编译器编译-纯PowerShell的输出-局部绘图❗

pwsh


在 WSL Ubuntu 20.04 中使用 Clang++ 编译 - Visual Studio Code 的输出 - 正确绘图 ✔️

wsl


使用 g++(Cygwin) 编译 - Visual Studio Code 的输出 + Cygwin Terminal 的输出 + Windows 终端的输出 - 正确绘图 ✔️(字形渲染有所不同)

all


使用 Visual C++ 编译器编译 - Visual Studio 2019 的输出 - 正确绘图 ✔️

vs

,

编辑:我刚刚阅读了you should not use wchar_t for unicode,所以这个答案不完整。

查看我的重复标签

#include <iostream>
#include <string>
#include <algorithm>

using std::wcout;
using std::wstring;
#define endl '\n'

int main() {
    std::locale::global(std::locale("en_US.UTF8"));
    std::wcout.imbue(std::locale());
    auto& f = std::use_facet<std::ctype<wchar_t>>(std::locale());

    wstring string0(L"hello there είναι απλά ένα κείμενο χωρίς");
    wstring string1(L"hallo Привет");
    wstring string2(L"Hallo Привет");
    wstring string3(L"HALLO ПРИВЕТ");

    wstring output0 = string0;
    f.toupper(&output0[0],&output0[output0.size()]);
    wcout << "input string:  " << string0 << endl
         << "output string: " << output0 << endl;

    wstring output1 = string1;
    f.toupper(&output1[0],&output1[output1.size()]);
    wcout << "input string:  " << string1 << endl
         << "output string: " << output1 << endl;

    wstring output2 = string2;
    f.toupper(&output2[0],&output2[output2.size()]);
    wcout << "input string:  " << string2 << endl
         << "output string: " << output2 << endl;

    wstring output3 = string3;
    f.toupper(&output3[0],&output3[output3.size()]);
    wcout << "input string:  " << string3 << endl
         << "output string: " << output3 << endl;

    wstring string4 = L"Contrairement à une opinion répandue";

    wstring output4 = string4;
    f.toupper(&output4[0],&output4[output4.size()]);
    wcout << "input string:  " << string4 << endl
         << "output string: " << output4 << endl;
}

返回

input string:  hello there είναι απλά ένα κείμενο χωρίς
output string: HELLO THERE ΕΊΝΑΙ ΑΠΛΆ ΈΝΑ ΚΕΊΜΕΝΟ ΧΩΡΊΣ
input string:  hallo Привет
output string: HALLO ПРИВЕТ
input string:  Hallo Привет
output string: HALLO ПРИВЕТ
input string:  HALLO ПРИВЕТ
output string: HALLO ПРИВЕТ
input string:  Contrairement à une opinion répandue
output string: CONTRAIREMENT À UNE OPINION RÉPANDUE

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。