Rune Caster 1.0.0
Modern C++ Text Processing Framework
Loading...
Searching...
No Matches
unicode.hpp
Go to the documentation of this file.
1#pragma once
2
3#include <cstdint>
4
5namespace rune_caster {
6namespace unicode {
7
8// 유니코드 카테고리
19
20// 유니코드 스크립트
33
34// 정규화 형식
36 NFC, // Canonical Decomposition, followed by Canonical Composition
37 NFD, // Canonical Decomposition
38 NFKC, // Compatibility Decomposition, followed by Canonical Composition
39 NFKD // Compatibility Decomposition
40};
41
42// 유니코드 문자 속성 검사 함수들
// Reports whether a code point lies in the 7-bit ASCII range.
//
// cp: code point to test.
// Returns true when cp is no greater than U+007F.
constexpr bool is_ascii(char32_t cp) noexcept {
    constexpr char32_t kAsciiLast = 0x7F;
    return !(cp > kAsciiLast);
}
46
// Reports whether a code point is a Latin letter handled by this header:
// ASCII A-Z / a-z, or a letter in the upper part of the Latin-1 Supplement
// (U+00C0..U+00FF).
//
// cp: code point to test.
// Returns true for the letters described above, false otherwise.
//
// U+00D7 (MULTIPLICATION SIGN) and U+00F7 (DIVISION SIGN) sit inside the
// U+00C0..U+00FF range but are math symbols, not letters, so they are
// rejected explicitly.
constexpr bool is_latin(char32_t cp) noexcept {
    if ((cp >= 0x0041 && cp <= 0x005A) ||   // A-Z
        (cp >= 0x0061 && cp <= 0x007A)) {   // a-z
        return true;
    }
    return cp >= 0x00C0 && cp <= 0x00FF &&  // Latin-1 Supplement letters
           cp != 0x00D7 && cp != 0x00F7;    // exclude the two math signs
}
52
// Reports whether a code point falls in one of three Hangul ranges:
// Hangul Jamo (U+1100..U+11FF), Hangul Compatibility Jamo (U+3130..U+318F)
// or Hangul Syllables (U+AC00..U+D7AF).
//
// cp: code point to test.
// Returns true when cp is inside any of those ranges.
constexpr bool is_hangul(char32_t cp) noexcept {
    // Walk the ranges in ascending order, bailing out as soon as cp is decided.
    if (cp < 0x1100) return false;
    if (cp <= 0x11FF) return true;    // Hangul Jamo
    if (cp < 0x3130) return false;
    if (cp <= 0x318F) return true;    // Hangul Compatibility Jamo
    if (cp < 0xAC00) return false;
    return cp <= 0xD7AF;              // Hangul Syllables
}
58
// Reports whether a code point falls in U+3040..U+309F, the range this
// header treats as hiragana.
//
// cp: code point to test.
// Returns true when cp is inside that range (both ends inclusive).
constexpr bool is_hiragana(char32_t cp) noexcept {
    constexpr char32_t kFirst = 0x3040;
    constexpr char32_t kLast = 0x309F;
    return !(cp < kFirst || cp > kLast);
}
62
// Reports whether a code point falls in U+30A0..U+30FF, the range this
// header treats as katakana.
//
// cp: code point to test.
// Returns true when cp is inside that range (both ends inclusive).
constexpr bool is_katakana(char32_t cp) noexcept {
    constexpr char32_t kFirst = 0x30A0;
    constexpr char32_t kLast = 0x30FF;
    return !(cp < kFirst || cp > kLast);
}
66
// Reports whether a code point falls in one of two CJK ideograph ranges:
// CJK Extension A (U+3400..U+4DBF) or CJK Unified Ideographs
// (U+4E00..U+9FFF).
//
// cp: code point to test.
// Returns true when cp is inside either range.
constexpr bool is_kanji(char32_t cp) noexcept {
    // CJK Extension A
    if (cp >= 0x3400 && cp <= 0x4DBF) {
        return true;
    }
    // CJK Unified Ideographs
    return cp >= 0x4E00 && cp <= 0x9FFF;
}
71
// Reports whether a code point is whitespace.
//
// Covers the code points carrying the Unicode White_Space property:
//   U+0009..U+000D  tab, line feed, vertical tab, form feed, carriage return
//   U+0020          space
//   U+0085          next line
//   U+00A0          no-break space
//   U+1680          ogham space mark
//   U+2000..U+200A  en quad .. hair space
//   U+2028, U+2029  line / paragraph separator
//   U+202F          narrow no-break space
//   U+205F          medium mathematical space
//   U+3000          ideographic space (the full-width space used in CJK text)
//
// cp: code point to test.
// Returns true for the code points listed above, false otherwise. Zero-width
// characters such as U+200B are not White_Space and are rejected.
constexpr bool is_whitespace(char32_t cp) noexcept {
    if (cp >= 0x0009 && cp <= 0x000D) return true;  // \t \n \v \f \r
    if (cp >= 0x2000 && cp <= 0x200A) return true;  // fixed-width spaces
    switch (cp) {
        case 0x0020:
        case 0x0085:
        case 0x00A0:
        case 0x1680:
        case 0x2028:
        case 0x2029:
        case 0x202F:
        case 0x205F:
        case 0x3000:
            return true;
        default:
            return false;
    }
}
76
77constexpr bool is_letter(char32_t cp) noexcept {
78 return is_latin(cp) || is_hangul(cp) || is_hiragana(cp) ||
79 is_katakana(cp) || is_kanji(cp);
80}
81
// Reports whether a code point is one of the ASCII decimal digits '0'..'9'.
//
// cp: code point to test.
// Returns true only for U+0030..U+0039.
constexpr bool is_digit(char32_t cp) noexcept {
    constexpr char32_t kZero = U'0';
    constexpr char32_t kNine = U'9';
    return !(cp < kZero || cp > kNine);
}
85
// Reports whether a code point is an ASCII punctuation/symbol character,
// i.e. one of:
//   ! " # $ % & ' ( ) * + , - . /     (U+0021..U+002F)
//   : ; < = > ? @                     (U+003A..U+0040)
//   [ \ ] ^ _ `                       (U+005B..U+0060)
//   { | } ~                           (U+007B..U+007E)
//
// cp: code point to test.
// Returns true for exactly those code points.
constexpr bool is_punctuation(char32_t cp) noexcept {
    // Anything outside the printable, non-space ASCII span is rejected up front.
    if (cp < 0x0021 || cp > 0x007E) {
        return false;
    }
    // Within that span, everything except digits and letters qualifies.
    const bool ascii_digit = cp >= 0x0030 && cp <= 0x0039;
    const bool ascii_upper = cp >= 0x0041 && cp <= 0x005A;
    const bool ascii_lower = cp >= 0x0061 && cp <= 0x007A;
    return !(ascii_digit || ascii_upper || ascii_lower);
}
92
93// 카테고리 결정 함수
// Classifies a code point into a Category by trying is_letter, is_digit and
// is_whitespace in that order; the first predicate that accepts cp decides
// the result.
//
// cp: code point to classify.
// Returns Category::Letter, Category::Number or Category::Separator for the
// visible checks, and Category::Unknown when none of them match.
94constexpr Category get_category(char32_t cp) noexcept {
95 if (is_letter(cp)) return Category::Letter;
96 if (is_digit(cp)) return Category::Number;
// NOTE(review): listing line 97 is not present in this view. is_punctuation()
// is not called by any visible line, so a punctuation check presumably
// belongs here — confirm against the original header.
98 if (is_whitespace(cp)) return Category::Separator;
99 return Category::Unknown;
100}
101
102// 스크립트 결정 함수
103constexpr Script get_script(char32_t cp) noexcept {
104 if (is_latin(cp)) return Script::Latin;
105 if (is_hangul(cp)) return Script::Hangul;
106 if (is_hiragana(cp)) return Script::Hiragana;
107 if (is_katakana(cp)) return Script::Katakana;
108 if (is_kanji(cp)) return Script::Han;
109 return Script::Unknown;
110}
111
112} // namespace unicode
113} // namespace rune_caster
constexpr bool is_kanji(char32_t cp) noexcept
Definition unicode.hpp:67
constexpr bool is_whitespace(char32_t cp) noexcept
Definition unicode.hpp:72
constexpr bool is_hangul(char32_t cp) noexcept
Definition unicode.hpp:53
constexpr Category get_category(char32_t cp) noexcept
Definition unicode.hpp:94
constexpr bool is_ascii(char32_t cp) noexcept
Definition unicode.hpp:43
constexpr bool is_hiragana(char32_t cp) noexcept
Definition unicode.hpp:59
constexpr bool is_letter(char32_t cp) noexcept
Definition unicode.hpp:77
constexpr bool is_punctuation(char32_t cp) noexcept
Definition unicode.hpp:86
constexpr bool is_digit(char32_t cp) noexcept
Definition unicode.hpp:82
constexpr Script get_script(char32_t cp) noexcept
Definition unicode.hpp:103
constexpr bool is_katakana(char32_t cp) noexcept
Definition unicode.hpp:63
constexpr bool is_latin(char32_t cp) noexcept
Definition unicode.hpp:47