Rune Caster 1.0.0
Modern C++ Text Processing Framework
Loading...
Searching...
No Matches
rune.cpp
Go to the documentation of this file.
2#include <stdexcept>
3#include <codecvt>
4#include <locale>
5
6namespace rune_caster {
7
8// === Constructors (C++20 constexpr enhanced) ===
9
10Rune::Rune(char32_t codepoint, language::Code lang, std::string phoneme)
11 : codepoint_(codepoint)
12 , language_(lang)
13 , phoneme_(std::move(phoneme))
14{
15}
16
17// === Category and Script accessors (constexpr) ===
18
19constexpr unicode::Category Rune::category() const noexcept {
20 return unicode::get_category(codepoint_);
21}
22
23constexpr unicode::Script Rune::script() const noexcept {
24 return unicode::get_script(codepoint_);
25}
26
27// === Language-specific properties (API Design Document requirement) ===
28
29constexpr bool Rune::is_ascii() const noexcept {
30 return codepoint_ <= 0x7F;
31}
32
33constexpr bool Rune::is_latin() const noexcept {
34 // Basic Latin (U+0000-U+007F) and Latin-1 Supplement (U+0080-U+00FF)
35 // Latin Extended-A (U+0100-U+017F) and Latin Extended-B (U+0180-U+024F)
36 // Latin Extended Additional (U+1E00-U+1EFF)
37 return (codepoint_ <= 0x024F) ||
38 (codepoint_ >= 0x1E00 && codepoint_ <= 0x1EFF);
39}
40
41constexpr bool Rune::is_hangul() const noexcept {
42 // Hangul Jamo (U+1100-U+11FF)
43 // Hangul Compatibility Jamo (U+3130-U+318F)
44 // Hangul Syllables (U+AC00-U+D7AF)
45 return (codepoint_ >= 0x1100 && codepoint_ <= 0x11FF) ||
46 (codepoint_ >= 0x3130 && codepoint_ <= 0x318F) ||
47 (codepoint_ >= 0xAC00 && codepoint_ <= 0xD7AF);
48}
49
50constexpr bool Rune::is_hiragana() const noexcept {
51 // Hiragana (U+3040-U+309F)
52 return codepoint_ >= 0x3040 && codepoint_ <= 0x309F;
53}
54
55constexpr bool Rune::is_katakana() const noexcept {
56 // Katakana (U+30A0-U+30FF)
57 // Katakana Phonetic Extensions (U+31F0-U+31FF)
58 return (codepoint_ >= 0x30A0 && codepoint_ <= 0x30FF) ||
59 (codepoint_ >= 0x31F0 && codepoint_ <= 0x31FF);
60}
61
62constexpr bool Rune::is_kanji() const noexcept {
63 // CJK Unified Ideographs (U+4E00-U+9FFF)
64 // CJK Unified Ideographs Extension A (U+3400-U+4DBF)
65 // CJK Unified Ideographs Extension B (U+20000-U+2A6DF)
66 return (codepoint_ >= 0x4E00 && codepoint_ <= 0x9FFF) ||
67 (codepoint_ >= 0x3400 && codepoint_ <= 0x4DBF) ||
68 (codepoint_ >= 0x20000 && codepoint_ <= 0x2A6DF);
69}
70
71constexpr bool Rune::is_emoji() const noexcept {
72 // Emoticons (U+1F600-U+1F64F)
73 // Miscellaneous Symbols and Pictographs (U+1F300-U+1F5FF)
74 // Transport and Map Symbols (U+1F680-U+1F6FF)
75 // Supplemental Symbols and Pictographs (U+1F900-U+1F9FF)
76 return (codepoint_ >= 0x1F600 && codepoint_ <= 0x1F64F) ||
77 (codepoint_ >= 0x1F300 && codepoint_ <= 0x1F5FF) ||
78 (codepoint_ >= 0x1F680 && codepoint_ <= 0x1F6FF) ||
79 (codepoint_ >= 0x1F900 && codepoint_ <= 0x1F9FF) ||
80 (codepoint_ >= 0x2600 && codepoint_ <= 0x26FF) || // Miscellaneous Symbols
81 (codepoint_ >= 0x2700 && codepoint_ <= 0x27BF); // Dingbats
82}
83
84// === Conversion methods ===
85
86std::string Rune::to_utf8() const {
87 std::string result;
88
89 if (codepoint_ <= 0x7F) {
90 // 1-byte sequence
91 result.push_back(static_cast<char>(codepoint_));
92 } else if (codepoint_ <= 0x7FF) {
93 // 2-byte sequence
94 result.push_back(static_cast<char>(0xC0 | (codepoint_ >> 6)));
95 result.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
96 } else if (codepoint_ <= 0xFFFF) {
97 // 3-byte sequence
98 result.push_back(static_cast<char>(0xE0 | (codepoint_ >> 12)));
99 result.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
100 result.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
101 } else if (codepoint_ <= 0x10FFFF) {
102 // 4-byte sequence
103 result.push_back(static_cast<char>(0xF0 | (codepoint_ >> 18)));
104 result.push_back(static_cast<char>(0x80 | ((codepoint_ >> 12) & 0x3F)));
105 result.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
106 result.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
107 } else {
108 throw std::invalid_argument("Invalid Unicode codepoint");
109 }
110
111 return result;
112}
113
114std::u16string Rune::to_utf16() const {
115 std::u16string result;
116
117 if (codepoint_ <= 0xFFFF) {
118 // BMP character
119 result.push_back(static_cast<char16_t>(codepoint_));
120 } else if (codepoint_ <= 0x10FFFF) {
121 // Surrogate pair
122 char32_t adjusted = codepoint_ - 0x10000;
123 char16_t high = 0xD800 + (adjusted >> 10);
124 char16_t low = 0xDC00 + (adjusted & 0x3FF);
125 result.push_back(high);
126 result.push_back(low);
127 } else {
128 throw std::invalid_argument("Invalid Unicode codepoint");
129 }
130
131 return result;
132}
133
134constexpr std::u32string Rune::to_utf32() const {
135 return std::u32string(1, codepoint_);
136}
137
138// === Factory methods ===
139
140Rune Rune::from_utf8(std::string_view utf8_char) {
141 if (utf8_char.empty()) {
142 throw std::invalid_argument("Empty UTF-8 string");
143 }
144
145 auto it = utf8_char.begin();
146 char32_t codepoint = 0;
147
148 unsigned char first = static_cast<unsigned char>(*it++);
149
150 if (first <= 0x7F) {
151 // 1-byte sequence
152 codepoint = first;
153 } else if ((first & 0xE0) == 0xC0) {
154 // 2-byte sequence
155 if (utf8_char.size() < 2) {
156 throw std::invalid_argument("Invalid UTF-8 sequence");
157 }
158 unsigned char second = static_cast<unsigned char>(*it++);
159 if ((second & 0xC0) != 0x80) {
160 throw std::invalid_argument("Invalid UTF-8 sequence");
161 }
162 codepoint = ((first & 0x1F) << 6) | (second & 0x3F);
163 } else if ((first & 0xF0) == 0xE0) {
164 // 3-byte sequence
165 if (utf8_char.size() < 3) {
166 throw std::invalid_argument("Invalid UTF-8 sequence");
167 }
168 unsigned char second = static_cast<unsigned char>(*it++);
169 unsigned char third = static_cast<unsigned char>(*it++);
170 if ((second & 0xC0) != 0x80 || (third & 0xC0) != 0x80) {
171 throw std::invalid_argument("Invalid UTF-8 sequence");
172 }
173 codepoint = ((first & 0x0F) << 12) | ((second & 0x3F) << 6) | (third & 0x3F);
174 } else if ((first & 0xF8) == 0xF0) {
175 // 4-byte sequence
176 if (utf8_char.size() < 4) {
177 throw std::invalid_argument("Invalid UTF-8 sequence");
178 }
179 unsigned char second = static_cast<unsigned char>(*it++);
180 unsigned char third = static_cast<unsigned char>(*it++);
181 unsigned char fourth = static_cast<unsigned char>(*it++);
182 if ((second & 0xC0) != 0x80 || (third & 0xC0) != 0x80 || (fourth & 0xC0) != 0x80) {
183 throw std::invalid_argument("Invalid UTF-8 sequence");
184 }
185 codepoint = ((first & 0x07) << 18) | ((second & 0x3F) << 12) |
186 ((third & 0x3F) << 6) | (fourth & 0x3F);
187 } else {
188 throw std::invalid_argument("Invalid UTF-8 sequence");
189 }
190
191 return Rune(codepoint);
192}
193
194Rune Rune::from_utf8(std::string_view utf8_char, language::Code lang) {
195 Rune rune = from_utf8(utf8_char);
196 rune.set_language(lang);
197 return rune;
198}
199
200Rune Rune::from_utf16(std::u16string_view utf16_char) {
201 if (utf16_char.empty()) {
202 throw std::invalid_argument("Empty UTF-16 string");
203 }
204
205 char32_t codepoint = 0;
206 char16_t first = utf16_char[0];
207
208 if (first >= 0xD800 && first <= 0xDBFF) {
209 // High surrogate
210 if (utf16_char.size() < 2) {
211 throw std::invalid_argument("Invalid UTF-16 surrogate pair");
212 }
213 char16_t second = utf16_char[1];
214 if (second < 0xDC00 || second > 0xDFFF) {
215 throw std::invalid_argument("Invalid UTF-16 surrogate pair");
216 }
217 codepoint = 0x10000 + ((first & 0x3FF) << 10) + (second & 0x3FF);
218 } else if (first >= 0xDC00 && first <= 0xDFFF) {
219 throw std::invalid_argument("Invalid UTF-16 sequence: unexpected low surrogate");
220 } else {
221 // BMP character
222 codepoint = first;
223 }
224
225 return Rune(codepoint);
226}
227
228// === Language detection is now defined inline in header ===
229
230} // namespace rune_caster
constexpr bool is_emoji() const noexcept
Check if this is an emoji character.
Definition rune.cpp:71
constexpr bool is_kanji() const noexcept
Check if this is a Kanji character.
Definition rune.cpp:62
constexpr bool is_hangul() const noexcept
Check if this is a Hangul (Korean) character.
Definition rune.cpp:41
static Rune from_utf8(std::string_view utf8_char)
Create a Rune from a UTF-8 character.
Definition rune.cpp:140
constexpr Rune() noexcept
Default constructor (creates null character)
Definition rune.hpp:274
constexpr bool is_katakana() const noexcept
Check if this is a Katakana character.
Definition rune.cpp:55
constexpr bool is_hiragana() const noexcept
Check if this is a Hiragana character.
Definition rune.cpp:50
constexpr bool is_latin() const noexcept
Check if this is a Latin script character.
Definition rune.cpp:33
constexpr unicode::Category category() const noexcept
Get the Unicode category.
Definition rune.cpp:19
const std::string & phoneme() const noexcept
Get the phonetic representation.
Definition rune.hpp:90
constexpr bool is_ascii() const noexcept
Check if this is an ASCII character.
Definition rune.cpp:29
constexpr void set_language(language::Code lang) noexcept
Set the language code.
Definition rune.hpp:98
std::string to_utf8() const
Convert to UTF-8 string.
Definition rune.cpp:86
static Rune from_utf16(std::u16string_view utf16_char)
Create a Rune from a UTF-16 character.
Definition rune.cpp:200
constexpr unicode::Script script() const noexcept
Get the Unicode script.
Definition rune.cpp:23
constexpr std::u32string to_utf32() const
Convert to UTF-32 string.
Definition rune.cpp:134
std::u16string to_utf16() const
Convert to UTF-16 string.
Definition rune.cpp:114
constexpr char32_t codepoint() const noexcept
Get the Unicode codepoint.
Definition rune.hpp:66
Code
Enumeration of supported language codes.
Definition language.hpp:43
constexpr Category get_category(char32_t cp) noexcept
Definition unicode.hpp:94
constexpr Script get_script(char32_t cp) noexcept
Definition unicode.hpp:103