fix unicode issues

This commit is contained in:
Hamcha 2024-10-25 22:26:00 +02:00
parent e09f185bb1
commit 8e830d03cb
Signed by: hamcha
GPG key ID: 1669C533B8CF6D89
3 changed files with 60 additions and 48 deletions

2
Cargo.lock generated
View file

@ -235,7 +235,7 @@ dependencies = [
[[package]] [[package]]
name = "weeblib" name = "weeblib"
version = "0.1.1" version = "0.1.2"
dependencies = [ dependencies = [
"console_error_panic_hook", "console_error_panic_hook",
"wasm-bindgen", "wasm-bindgen",

View file

@ -1,6 +1,6 @@
[package] [package]
name = "weeblib" name = "weeblib"
version = "0.1.1" version = "0.1.2"
edition = "2021" edition = "2021"
authors = ["Hamcha <hamcha@crunchy.rocks>"] authors = ["Hamcha <hamcha@crunchy.rocks>"]

View file

@ -1,7 +1,7 @@
use crate::syllable::{Syllable, Syllable::*}; use crate::syllable::{Syllable, Syllable::*};
fn match4(s: &str) -> Option<Syllable> { fn match4(s: &[char]) -> Option<Syllable> {
match s { match s.iter().collect::<String>().as_str() {
"scia" => Some(Sha), "scia" => Some(Sha),
"scio" => Some(Sho), "scio" => Some(Sho),
"sciu" => Some(Shu), "sciu" => Some(Shu),
@ -15,8 +15,8 @@ fn match4(s: &str) -> Option<Syllable> {
} }
} }
fn match3(s: &str) -> Option<Syllable> { fn match3(s: &[char]) -> Option<Syllable> {
match s { match s.iter().collect::<String>().as_str() {
"sci" => Some(Shi), "sci" => Some(Shi),
"chi" => Some(Ki), "chi" => Some(Ki),
"che" => Some(Ke), "che" => Some(Ke),
@ -48,8 +48,8 @@ fn match3(s: &str) -> Option<Syllable> {
} }
} }
fn match2(s: &str) -> Option<Syllable> { fn match2(s: &[char]) -> Option<Syllable> {
match s { match s.iter().collect::<String>().as_str() {
"ka" => Some(Ka), "ka" => Some(Ka),
"ca" => Some(Ka), "ca" => Some(Ka),
"ki" => Some(Ki), "ki" => Some(Ki),
@ -133,33 +133,33 @@ fn match2(s: &str) -> Option<Syllable> {
} }
} }
fn match1(s: &str) -> Option<Syllable> { fn match1(s: &char) -> Option<Syllable> {
match s { match s {
"a" => Some(A), 'a' => Some(A),
"i" => Some(I), 'i' => Some(I),
"u" => Some(U), 'u' => Some(U),
"e" => Some(E), 'e' => Some(E),
"o" => Some(O), 'o' => Some(O),
"n" => Some(N), 'n' => Some(N),
"s" => Some(Su), 's' => Some(Su),
"r" => Some(Ru), 'r' => Some(Ru),
"b" => Some(Bu), 'b' => Some(Bu),
"m" => Some(Mu), 'm' => Some(Mu),
"y" => Some(Yu), 'y' => Some(Yu),
"w" => Some(Wa), 'w' => Some(Wa),
"g" => Some(Gu), 'g' => Some(Gu),
"z" => Some(Zu), 'z' => Some(Zu),
"d" => Some(De), 'd' => Some(De),
"p" => Some(Pu), 'p' => Some(Pu),
"k" => Some(Ku), 'k' => Some(Ku),
"f" => Some(Fu), 'f' => Some(Fu),
"c" => Some(Ku), 'c' => Some(Ku),
"t" => Some(Tsu), 't' => Some(Tsu),
"v" => Some(Vu), 'v' => Some(Vu),
"l" => Some(Ru), 'l' => Some(Ru),
"h" => Some(LongVowel), 'h' => Some(LongVowel),
"j" => Some(I), 'j' => Some(I),
"q" => Some(Kyu), 'q' => Some(Kyu),
_ => None, _ => None,
} }
} }
@ -186,23 +186,24 @@ pub(crate) fn romanize(word: &str) -> Vec<Syllable> {
.chars() .chars()
.map(remove_accents) .map(remove_accents)
.collect(); .collect();
let mut remaining = lowercase.clone(); let mut remaining: Vec<char> = lowercase.clone().chars().collect();
let mut syllables = Vec::new(); let mut syllables = Vec::new();
while remaining.len() > 0 { while remaining.len() > 0 {
if !remaining.as_bytes()[0].is_ascii_alphabetic() { let next = remaining[0];
syllables.push(NonAlpha(remaining.as_bytes()[0] as char)); if !next.is_alphabetic() {
remaining = remaining[1..].to_string(); syllables.push(NonAlpha(next));
remaining.remove(0);
continue; continue;
} }
// Check for double consonants // Check for double consonants
if remaining.len() >= 3 && remaining.as_bytes()[0] == remaining.as_bytes()[1] { if remaining.len() >= 3 && remaining[0] == remaining[1] {
syllables.push(LittleTsu); syllables.push(LittleTsu);
remaining = remaining[1..].to_string(); remaining.remove(0);
continue; continue;
} }
// Check for X // Check for X
if remaining.as_bytes()[0] == b'x' { if remaining[0] == 'x' {
syllables.push(Ku); syllables.push(Ku);
if remaining.len() < 2 { if remaining.len() < 2 {
// Last letter // Last letter
@ -210,36 +211,42 @@ pub(crate) fn romanize(word: &str) -> Vec<Syllable> {
break; break;
} }
// Replace X with S // Replace X with S
remaining = ["s", &remaining[1..]].concat(); remaining[0] = 's';
} }
// Check for 4 letter patterns, then 3, etc. // Check for 4 letter patterns, then 3, etc.
if remaining.len() >= 4 { if remaining.len() >= 4 {
if let Some(syllable) = match4(&remaining[..4]) { if let Some(syllable) = match4(&remaining[..4]) {
syllables.push(syllable); syllables.push(syllable);
remaining = remaining[4..].to_string(); remaining.remove(0);
remaining.remove(0);
remaining.remove(0);
remaining.remove(0);
continue; continue;
} }
} }
if remaining.len() >= 3 { if remaining.len() >= 3 {
if let Some(syllable) = match3(&remaining[..3]) { if let Some(syllable) = match3(&remaining[..3]) {
syllables.push(syllable); syllables.push(syllable);
remaining = remaining[3..].to_string(); remaining.remove(0);
remaining.remove(0);
remaining.remove(0);
continue; continue;
} }
} }
if remaining.len() >= 2 { if remaining.len() >= 2 {
if let Some(syllable) = match2(&remaining[..2]) { if let Some(syllable) = match2(&remaining[..2]) {
syllables.push(syllable); syllables.push(syllable);
remaining = remaining[2..].to_string(); remaining.remove(0);
remaining.remove(0);
continue; continue;
} }
} }
if let Some(syllable) = match1(&remaining[..1]) { if let Some(syllable) = match1(&remaining[0]) {
syllables.push(syllable); syllables.push(syllable);
remaining = remaining[1..].to_string(); remaining.remove(0);
continue; continue;
} }
panic!("No match found for {} ({})", remaining, word); panic!("No match found for {} ({})", remaining.iter().collect::<String>(), word);
} }
syllables syllables
@ -345,6 +352,11 @@ mod tests {
} }
} }
#[test]
fn test_utf8_symbol() {
    // The em dash is a multi-byte UTF-8 character; it must come back as a
    // single NonAlpha syllable, with the surrounding words romanized as usual.
    let expected = vec![
        So,
        Me,
        NonAlpha(' '),
        NonAlpha('—'),
        NonAlpha(' '),
        Te,
        Su,
        Tsu,
    ];
    assert_eq!(romanize("Some — test"), expected);
}
#[test] #[test]
fn test_dictionary() { fn test_dictionary() {
// Read dictionary file // Read dictionary file