Rune Caster 1.0.0
Modern C++ Text Processing Framework
Loading...
Searching...
No Matches
rune_sequence.cpp
Go to the documentation of this file.
2#include <algorithm>
3#include <stdexcept>
4#include <map>
5
6namespace rune_caster {
7
8// === Constructors ===
9
11 : runes_()
12 , primary_language_(primary_lang)
13{
14}
15
16RuneString::RuneString(std::string_view utf8)
17 : RuneString(from_utf8(utf8))
18{
19}
20
21RuneString::RuneString(std::u16string_view utf16)
22 : RuneString(from_utf16(utf16))
23{
24}
25
26RuneString::RuneString(std::u32string_view utf32)
27 : RuneString(from_utf32(utf32))
28{
29}
30
31RuneString::RuneString(std::initializer_list<Rune> runes)
32 : runes_(runes)
33 , primary_language_(language::Code::Unknown)
34{
35 // Auto-detect primary language from the most frequent language in the sequence
36 if (!runes_.empty()) {
37 std::map<language::Code, size_t> language_counts;
38 for (const auto& rune : runes_) {
39 if (rune.language() != language::Code::Unknown) {
40 ++language_counts[rune.language()];
41 }
42 }
43
44 if (!language_counts.empty()) {
45 primary_language_ = std::max_element(language_counts.begin(), language_counts.end(),
46 [](const auto& a, const auto& b) { return a.second < b.second; })->first;
47 }
48 }
49}
50
51// === Conversion methods ===
52
53std::string RuneString::to_utf8() const {
54 std::string result;
55 result.reserve(size() * 2); // Rough estimate for UTF-8 size
56
57 for (const auto& rune : runes_) {
58 result += rune.to_utf8();
59 }
60
61 return result;
62}
63
64std::u16string RuneString::to_utf16() const {
65 std::u16string result;
66 result.reserve(size() * 2); // Rough estimate for UTF-16 size
67
68 for (const auto& rune : runes_) {
69 auto utf16_char = rune.to_utf16();
70 result += utf16_char;
71 }
72
73 return result;
74}
75
76std::u32string RuneString::to_utf32() const {
77 std::u32string result;
78 result.reserve(size());
79
80 for (const auto& rune : runes_) {
81 result.push_back(rune.codepoint());
82 }
83
84 return result;
85}
86
87// === String operations ===
88
90 runes_.insert(runes_.end(), other.runes_.begin(), other.runes_.end());
91 return *this;
92}
93
95 runes_.push_back(rune);
96 return *this;
97}
98
99RuneString& RuneString::append(std::string_view utf8) {
100 auto other = from_utf8(utf8);
101 return append(other);
102}
103
105 if (start >= size()) {
106 return RuneString(primary_language_);
107 }
108
109 size_type actual_length = (length == npos) ? (size() - start) : std::min(length, size() - start);
110
111 RuneString result(primary_language_);
112 result.runes_.assign(runes_.begin() + start, runes_.begin() + start + actual_length);
113
114 return result;
115}
116
117// === Search operations ===
118
119RuneString::size_type RuneString::find(const Rune& rune, size_type pos) const noexcept {
120 if (pos >= size()) {
121 return npos;
122 }
123
124 auto it = std::find(runes_.begin() + pos, runes_.end(), rune);
125 return (it != runes_.end()) ? std::distance(runes_.begin(), it) : npos;
126}
127
129 if (str.empty()) {
130 return pos;
131 }
132
133 if (pos >= size() || str.size() > size() - pos) {
134 return npos;
135 }
136
137 auto it = std::search(runes_.begin() + pos, runes_.end(),
138 str.runes_.begin(), str.runes_.end());
139
140 return (it != runes_.end()) ? std::distance(runes_.begin(), it) : npos;
141}
142
143bool RuneString::contains(const Rune& rune) const noexcept {
144 return find(rune) != npos;
145}
146
147bool RuneString::contains(const RuneString& str) const noexcept {
148 return find(str) != npos;
149}
150
151// === Factory methods ===
152
153RuneString RuneString::from_utf8(std::string_view utf8_text) {
154 RuneString result;
155
156 if (utf8_text.empty()) {
157 return result;
158}
159
160 // Reserve space (rough estimate)
161 result.runes_.reserve(utf8_text.size());
162
163 std::map<language::Code, size_t> language_counts;
164
165 size_t i = 0;
166 while (i < utf8_text.size()) {
167 // Find the end of current UTF-8 character
168 size_t char_length = 1;
169 unsigned char first_byte = static_cast<unsigned char>(utf8_text[i]);
170
171 if (first_byte <= 0x7F) {
172 char_length = 1;
173 } else if ((first_byte & 0xE0) == 0xC0) {
174 char_length = 2;
175 } else if ((first_byte & 0xF0) == 0xE0) {
176 char_length = 3;
177 } else if ((first_byte & 0xF8) == 0xF0) {
178 char_length = 4;
179 } else {
180 // Invalid UTF-8, skip this byte
181 ++i;
182 continue;
183 }
184
185 if (i + char_length > utf8_text.size()) {
186 break; // Incomplete character at end of string
187 }
188
189 try {
190 std::string_view char_view = utf8_text.substr(i, char_length);
191 Rune rune = Rune::from_utf8(char_view);
192 result.runes_.push_back(rune);
193
194 // Count languages for primary language detection
195 if (rune.language() != language::Code::Unknown) {
196 ++language_counts[rune.language()];
197 }
198 } catch (const std::invalid_argument&) {
199 // Skip invalid UTF-8 sequences
200 }
201
202 i += char_length;
203 }
204
205 // Set primary language based on most frequent language
206 if (!language_counts.empty()) {
207 result.primary_language_ = std::max_element(language_counts.begin(), language_counts.end(),
208 [](const auto& a, const auto& b) { return a.second < b.second; })->first;
209 }
210
211 return result;
212}
213
214RuneString RuneString::from_utf8(std::string_view utf8_text, language::Code lang) {
215 RuneString result = from_utf8(utf8_text);
216 result.set_primary_language(lang);
217
218 // Also set language hint for all runes if they don't have a specific language
219 for (auto& rune : result.runes_) {
220 if (rune.language() == language::Code::Unknown) {
221 rune.set_language(lang);
222 }
223 }
224
225 return result;
226}
227
228RuneString RuneString::from_utf16(std::u16string_view utf16_text) {
229 RuneString result;
230
231 if (utf16_text.empty()) {
232 return result;
233 }
234
235 result.runes_.reserve(utf16_text.size());
236 std::map<language::Code, size_t> language_counts;
237
238 size_t i = 0;
239 while (i < utf16_text.size()) {
240 try {
241 char16_t first = utf16_text[i];
242
243 if (first >= 0xD800 && first <= 0xDBFF) {
244 // High surrogate - need to get the low surrogate too
245 if (i + 1 >= utf16_text.size()) {
246 break; // Incomplete surrogate pair
247 }
248 std::u16string_view char_view = utf16_text.substr(i, 2);
249 Rune rune = Rune::from_utf16(char_view);
250 result.runes_.push_back(rune);
251
252 if (rune.language() != language::Code::Unknown) {
253 ++language_counts[rune.language()];
254 }
255
256 i += 2;
257 } else if (first >= 0xDC00 && first <= 0xDFFF) {
258 // Unexpected low surrogate - skip
259 ++i;
260 } else {
261 // BMP character
262 std::u16string_view char_view = utf16_text.substr(i, 1);
263 Rune rune = Rune::from_utf16(char_view);
264 result.runes_.push_back(rune);
265
266 if (rune.language() != language::Code::Unknown) {
267 ++language_counts[rune.language()];
268 }
269
270 ++i;
271 }
272 } catch (const std::invalid_argument&) {
273 // Skip invalid UTF-16 sequences
274 ++i;
275 }
276 }
277
278 // Set primary language
279 if (!language_counts.empty()) {
280 result.primary_language_ = std::max_element(language_counts.begin(), language_counts.end(),
281 [](const auto& a, const auto& b) { return a.second < b.second; })->first;
282 }
283
284 return result;
285}
286
287RuneString RuneString::from_utf32(std::u32string_view utf32_text) {
288 RuneString result;
289
290 if (utf32_text.empty()) {
291 return result;
292 }
293
294 result.runes_.reserve(utf32_text.size());
295 std::map<language::Code, size_t> language_counts;
296
297 for (char32_t codepoint : utf32_text) {
298 try {
299 Rune rune(codepoint);
300 result.runes_.push_back(rune);
301
302 if (rune.language() != language::Code::Unknown) {
303 ++language_counts[rune.language()];
304 }
305 } catch (const std::invalid_argument&) {
306 // Skip invalid codepoints
307 }
308 }
309
310 // Set primary language
311 if (!language_counts.empty()) {
312 result.primary_language_ = std::max_element(language_counts.begin(), language_counts.end(),
313 [](const auto& a, const auto& b) { return a.second < b.second; })->first;
314 }
315
316 return result;
317}
318
319} // namespace rune_caster
bool contains(const Rune &rune) const noexcept
Check if the string contains a Rune.
static RuneString from_utf16(std::u16string_view utf16_text)
Create a RuneString from UTF-16 text.
RuneString() noexcept=default
Default constructor.
static RuneString from_utf8(std::string_view utf8_text)
Create a RuneString from UTF-8 text.
static constexpr size_type npos
void set_primary_language(language::Code lang) noexcept
Set the primary language of the sequence.
size_type find(const Rune &rune, size_type pos=0) const noexcept
Find first occurrence of a Rune.
static RuneString from_utf32(std::u32string_view utf32_text)
Create a RuneString from UTF-32 text.
std::u16string to_utf16() const
Convert the sequence to UTF-16 string.
size_type size() const noexcept
RuneString & append(const RuneString &other)
Append another RuneString.
std::u32string to_utf32() const
Convert the sequence to UTF-32 string.
RuneString substr(size_type start, size_type length=npos) const
Create a substring.
std::string to_utf8() const
Convert the sequence to UTF-8 string.
size_type length() const noexcept
Represents a single textual unit with Unicode and linguistic properties.
Definition rune.hpp:23
static Rune from_utf8(std::string_view utf8_char)
Create a Rune from a UTF-8 character.
Definition rune.cpp:140
constexpr language::Code language() const noexcept
Get the language code.
Definition rune.hpp:72
static Rune from_utf16(std::u16string_view utf16_char)
Create a Rune from a UTF-16 character.
Definition rune.cpp:200
Language detection and identification functionality.
Definition concepts.hpp:15
Code
Enumeration of supported language codes.
Definition language.hpp:43
@ Unknown
Unknown or undetected language.
Definition language.hpp:44