Rune Caster 1.0.0
Modern C++ Text Processing Framework
Loading...
Searching...
No Matches
rune.hpp
Go to the documentation of this file.
1#pragma once
2
3#include <cstdint>
4#include <string>
5#include <string_view>
6
7#include "unicode.hpp"
8#include "language.hpp"
9
10namespace rune_caster {
11
23class Rune {
24public:
25 // === Constructors (C++20 constexpr enhanced) ===
26
30 constexpr Rune() noexcept;
31
36 constexpr explicit Rune(char32_t codepoint) noexcept;
37
43 constexpr Rune(char32_t codepoint, language::Code lang) noexcept;
44
51 Rune(char32_t codepoint, language::Code lang, std::string phoneme);
52
53 // === Copy/Move semantics ===
54 constexpr Rune(const Rune&) = default;
55 constexpr Rune(Rune&&) noexcept = default;
56 constexpr Rune& operator=(const Rune&) = default;
57 constexpr Rune& operator=(Rune&&) noexcept = default;
58 ~Rune() = default;
59
60 // === Accessors (C++20 constexpr) ===
61
66 [[nodiscard]] constexpr char32_t codepoint() const noexcept { return codepoint_; }
67
72 [[nodiscard]] constexpr language::Code language() const noexcept { return language_; }
73
78 [[nodiscard]] constexpr unicode::Category category() const noexcept;
79
84 [[nodiscard]] constexpr unicode::Script script() const noexcept;
85
90 [[nodiscard]] const std::string& phoneme() const noexcept { return phoneme_; }
91
92 // === Mutators ===
93
98 constexpr void set_language(language::Code lang) noexcept { language_ = lang; }
99
104 void set_phoneme(std::string phoneme) { phoneme_ = std::move(phoneme); }
105
106 // === Enhanced Character classification (C++20 constexpr) ===
107
112 [[nodiscard]] constexpr bool is_vowel() const noexcept;
113
118 [[nodiscard]] constexpr bool is_consonant() const noexcept;
119
124 [[nodiscard]] constexpr bool is_letter() const noexcept;
125
130 [[nodiscard]] constexpr bool is_digit() const noexcept;
131
136 [[nodiscard]] constexpr bool is_whitespace() const noexcept;
137
142 [[nodiscard]] constexpr bool is_punctuation() const noexcept;
143
144 // === Language-specific properties (API Design Document requirement) ===
145
150 [[nodiscard]] constexpr bool is_ascii() const noexcept;
151
156 [[nodiscard]] constexpr bool is_latin() const noexcept;
157
162 [[nodiscard]] constexpr bool is_hangul() const noexcept;
163
168 [[nodiscard]] constexpr bool is_hiragana() const noexcept;
169
174 [[nodiscard]] constexpr bool is_katakana() const noexcept;
175
180 [[nodiscard]] constexpr bool is_kanji() const noexcept;
181
186 [[nodiscard]] constexpr bool is_emoji() const noexcept;
187
188 // === Conversion ===
189
194 [[nodiscard]] std::string to_utf8() const;
195
200 [[nodiscard]] std::u16string to_utf16() const;
201
206 [[nodiscard]] constexpr std::u32string to_utf32() const;
207
208 // === Factory methods (API Design Document requirement) ===
209
216 static Rune from_utf8(std::string_view utf8_char);
217
225 static Rune from_utf8(std::string_view utf8_char, language::Code lang);
226
233 static Rune from_utf16(std::u16string_view utf16_char);
234
235 // === Comparison operators (C++20 three-way comparison) ===
236
240 [[nodiscard]] constexpr auto operator<=>(const Rune& other) const noexcept = default;
241
245 [[nodiscard]] constexpr bool operator==(const Rune& other) const noexcept = default;
246
247private:
248 char32_t codepoint_;
249 language::Code language_;
250 std::string phoneme_;
251
257 static constexpr language::Code detect_language(char32_t cp) noexcept;
258};
259
260// === User-defined literals (API Design Document requirement) ===
261
269constexpr Rune operator""_rune(char32_t cp) noexcept {
270 return Rune(cp);
271}
272
273// === Inline definitions for constructors (header-level) ===
274inline constexpr Rune::Rune() noexcept : codepoint_(U'\0'), language_(language::Code::Unknown), phoneme_() {}
275inline constexpr Rune::Rune(char32_t codepoint) noexcept : codepoint_(codepoint), language_(Rune::detect_language(codepoint)), phoneme_() {}
276inline constexpr Rune::Rune(char32_t codepoint, language::Code lang) noexcept : codepoint_(codepoint), language_(lang), phoneme_() {}
277
278// === Inline definitions for classification methods ===
279inline constexpr bool Rune::is_letter() const noexcept { return unicode::is_letter(codepoint_); }
280inline constexpr bool Rune::is_digit() const noexcept { return unicode::is_digit(codepoint_); }
281inline constexpr bool Rune::is_whitespace() const noexcept { return unicode::is_whitespace(codepoint_); }
282inline constexpr bool Rune::is_punctuation() const noexcept { return unicode::is_punctuation(codepoint_); }
283
284// === Language detection (inline implementation) ===
285inline constexpr language::Code Rune::detect_language(char32_t cp) noexcept {
286 // Korean (Hangul)
287 if ((cp >= 0x1100 && cp <= 0x11FF) || // Hangul Jamo
288 (cp >= 0x3130 && cp <= 0x318F) || // Hangul Compatibility Jamo
289 (cp >= 0xAC00 && cp <= 0xD7AF)) { // Hangul Syllables
291 }
292
293 // Japanese
294 if ((cp >= 0x3040 && cp <= 0x309F) || // Hiragana
295 (cp >= 0x30A0 && cp <= 0x30FF) || // Katakana
296 (cp >= 0x31F0 && cp <= 0x31FF)) { // Katakana Phonetic Extensions
298 }
299
300 // Chinese (CJK Ideographs)
301 if ((cp >= 0x4E00 && cp <= 0x9FFF) || // CJK Unified Ideographs
302 (cp >= 0x3400 && cp <= 0x4DBF) || // CJK Extension A
303 (cp >= 0x20000 && cp <= 0x2A6DF)) { // CJK Extension B
305 }
306
307 // English/Latin
308 if ((cp <= 0x024F) || // Basic Latin + Latin Extended
309 (cp >= 0x1E00 && cp <= 0x1EFF)) { // Latin Extended Additional
311 }
312
314}
315inline constexpr bool Rune::is_vowel() const noexcept {
316 char32_t c = codepoint_;
317
318 // 라틴 알파벳 모음 (영어)
319 if (c == U'a' || c == U'e' || c == U'i' || c == U'o' || c == U'u' ||
320 c == U'A' || c == U'E' || c == U'I' || c == U'O' || c == U'U') {
321 return true;
322 }
323
324 // 한글 모음 (ㅏ-ㅣ 범위)
325 if (c >= U'ㅏ' && c <= U'ㅣ') {
326 return true;
327 }
328
329 // 일본어 모음 (히라가나)
330 if (c == U'あ' || c == U'い' || c == U'う' || c == U'え' || c == U'お') {
331 return true;
332 }
333
334 // 일본어 모음 (가타카나)
335 if (c == U'ア' || c == U'イ' || c == U'ウ' || c == U'エ' || c == U'オ') {
336 return true;
337 }
338
339 return false;
340}
341inline constexpr bool Rune::is_consonant() const noexcept {
342 return is_letter() && !is_vowel();
343}
344// inline definitions for previously constexpr functions already specialized in rune.cpp (will be identical)
345
346} // namespace rune_caster
Represents a single textual unit with Unicode and linguistic properties.
Definition rune.hpp:23
constexpr bool is_emoji() const noexcept
Check if this is an emoji character.
Definition rune.cpp:71
constexpr Rune(const Rune &)=default
constexpr bool is_kanji() const noexcept
Check if this is a Kanji character.
Definition rune.cpp:62
constexpr bool is_hangul() const noexcept
Check if this is a Hangul (Korean) character.
Definition rune.cpp:41
constexpr bool is_letter() const noexcept
Check if this rune is a letter.
Definition rune.hpp:279
constexpr Rune(Rune &&) noexcept=default
static Rune from_utf8(std::string_view utf8_char)
Create a Rune from a UTF-8 character.
Definition rune.cpp:140
constexpr Rune() noexcept
Default constructor (creates null character)
Definition rune.hpp:274
constexpr bool is_consonant() const noexcept
Check if this rune represents a consonant.
Definition rune.hpp:341
constexpr bool is_whitespace() const noexcept
Check if this rune is whitespace.
Definition rune.hpp:281
constexpr bool is_katakana() const noexcept
Check if this is a Katakana character.
Definition rune.cpp:55
constexpr bool is_hiragana() const noexcept
Check if this is a Hiragana character.
Definition rune.cpp:50
constexpr language::Code language() const noexcept
Get the language code.
Definition rune.hpp:72
constexpr bool is_latin() const noexcept
Check if this is a Latin script character.
Definition rune.cpp:33
constexpr unicode::Category category() const noexcept
Get the Unicode category.
Definition rune.cpp:19
void set_phoneme(std::string phoneme)
Set the phonetic representation.
Definition rune.hpp:104
const std::string & phoneme() const noexcept
Get the phonetic representation.
Definition rune.hpp:90
constexpr bool is_ascii() const noexcept
Check if this is an ASCII character.
Definition rune.cpp:29
constexpr void set_language(language::Code lang) noexcept
Set the language code.
Definition rune.hpp:98
std::string to_utf8() const
Convert to UTF-8 string.
Definition rune.cpp:86
constexpr bool is_punctuation() const noexcept
Check if this rune is punctuation.
Definition rune.hpp:282
static Rune from_utf16(std::u16string_view utf16_char)
Create a Rune from a UTF-16 character.
Definition rune.cpp:200
constexpr unicode::Script script() const noexcept
Get the Unicode script.
Definition rune.cpp:23
constexpr bool is_digit() const noexcept
Check if this rune is a digit.
Definition rune.hpp:280
constexpr std::u32string to_utf32() const
Convert to UTF-32 string.
Definition rune.cpp:134
std::u16string to_utf16() const
Convert to UTF-16 string.
Definition rune.cpp:114
constexpr bool is_vowel() const noexcept
Check if this rune represents a vowel.
Definition rune.hpp:315
constexpr char32_t codepoint() const noexcept
Get the Unicode codepoint.
Definition rune.hpp:66
Language identification and localization support.
Language detection and identification functionality.
Definition concepts.hpp:15
Code
Enumeration of supported language codes.
Definition language.hpp:43
@ Chinese
中文 (zh-CN) - Chinese (Simplified)
Definition language.hpp:48
@ English
English (en-US) - English.
Definition language.hpp:46
@ Unknown
Unknown or undetected language.
Definition language.hpp:44
@ Korean
한국어 (ko-KR) - Korean
Definition language.hpp:45
@ Japanese
日本語 (ja-JP) - Japanese
Definition language.hpp:47
constexpr bool is_whitespace(char32_t cp) noexcept
Definition unicode.hpp:72
constexpr bool is_letter(char32_t cp) noexcept
Definition unicode.hpp:77
constexpr bool is_punctuation(char32_t cp) noexcept
Definition unicode.hpp:86
constexpr bool is_digit(char32_t cp) noexcept
Definition unicode.hpp:82