fix unicode issues

This commit is contained in:
Hamcha 2024-10-25 22:26:00 +02:00
parent e09f185bb1
commit 8e830d03cb
Signed by: hamcha
GPG key ID: 1669C533B8CF6D89
3 changed files with 60 additions and 48 deletions

2
Cargo.lock generated
View file

@@ -235,7 +235,7 @@ dependencies = [
[[package]]
name = "weeblib"
version = "0.1.1"
version = "0.1.2"
dependencies = [
"console_error_panic_hook",
"wasm-bindgen",

View file

@@ -1,6 +1,6 @@
[package]
name = "weeblib"
version = "0.1.1"
version = "0.1.2"
edition = "2021"
authors = ["Hamcha <hamcha@crunchy.rocks>"]

View file

@@ -1,7 +1,7 @@
use crate::syllable::{Syllable, Syllable::*};
fn match4(s: &str) -> Option<Syllable> {
match s {
fn match4(s: &[char]) -> Option<Syllable> {
match s.iter().collect::<String>().as_str() {
"scia" => Some(Sha),
"scio" => Some(Sho),
"sciu" => Some(Shu),
@@ -15,8 +15,8 @@ fn match4(s: &str) -> Option<Syllable> {
}
}
fn match3(s: &str) -> Option<Syllable> {
match s {
fn match3(s: &[char]) -> Option<Syllable> {
match s.iter().collect::<String>().as_str() {
"sci" => Some(Shi),
"chi" => Some(Ki),
"che" => Some(Ke),
@@ -48,8 +48,8 @@ fn match3(s: &str) -> Option<Syllable> {
}
}
fn match2(s: &str) -> Option<Syllable> {
match s {
fn match2(s: &[char]) -> Option<Syllable> {
match s.iter().collect::<String>().as_str() {
"ka" => Some(Ka),
"ca" => Some(Ka),
"ki" => Some(Ki),
@@ -133,33 +133,33 @@ fn match2(s: &str) -> Option<Syllable> {
}
}
fn match1(s: &str) -> Option<Syllable> {
fn match1(s: &char) -> Option<Syllable> {
match s {
"a" => Some(A),
"i" => Some(I),
"u" => Some(U),
"e" => Some(E),
"o" => Some(O),
"n" => Some(N),
"s" => Some(Su),
"r" => Some(Ru),
"b" => Some(Bu),
"m" => Some(Mu),
"y" => Some(Yu),
"w" => Some(Wa),
"g" => Some(Gu),
"z" => Some(Zu),
"d" => Some(De),
"p" => Some(Pu),
"k" => Some(Ku),
"f" => Some(Fu),
"c" => Some(Ku),
"t" => Some(Tsu),
"v" => Some(Vu),
"l" => Some(Ru),
"h" => Some(LongVowel),
"j" => Some(I),
"q" => Some(Kyu),
'a' => Some(A),
'i' => Some(I),
'u' => Some(U),
'e' => Some(E),
'o' => Some(O),
'n' => Some(N),
's' => Some(Su),
'r' => Some(Ru),
'b' => Some(Bu),
'm' => Some(Mu),
'y' => Some(Yu),
'w' => Some(Wa),
'g' => Some(Gu),
'z' => Some(Zu),
'd' => Some(De),
'p' => Some(Pu),
'k' => Some(Ku),
'f' => Some(Fu),
'c' => Some(Ku),
't' => Some(Tsu),
'v' => Some(Vu),
'l' => Some(Ru),
'h' => Some(LongVowel),
'j' => Some(I),
'q' => Some(Kyu),
_ => None,
}
}
@@ -186,23 +186,24 @@ pub(crate) fn romanize(word: &str) -> Vec<Syllable> {
.chars()
.map(remove_accents)
.collect();
let mut remaining = lowercase.clone();
let mut remaining: Vec<char> = lowercase.clone().chars().collect();
let mut syllables = Vec::new();
while remaining.len() > 0 {
if !remaining.as_bytes()[0].is_ascii_alphabetic() {
syllables.push(NonAlpha(remaining.as_bytes()[0] as char));
remaining = remaining[1..].to_string();
let next = remaining[0];
if !next.is_alphabetic() {
syllables.push(NonAlpha(next));
remaining.remove(0);
continue;
}
// Check for double consonants
if remaining.len() >= 3 && remaining.as_bytes()[0] == remaining.as_bytes()[1] {
if remaining.len() >= 3 && remaining[0] == remaining[1] {
syllables.push(LittleTsu);
remaining = remaining[1..].to_string();
remaining.remove(0);
continue;
}
// Check for X
if remaining.as_bytes()[0] == b'x' {
if remaining[0] == 'x' {
syllables.push(Ku);
if remaining.len() < 2 {
// Last letter
@@ -210,36 +211,42 @@ pub(crate) fn romanize(word: &str) -> Vec<Syllable> {
break;
}
// Replace X with S
remaining = ["s", &remaining[1..]].concat();
remaining[0] = 's';
}
// Check for 4 letter patterns, then 3, etc.
if remaining.len() >= 4 {
if let Some(syllable) = match4(&remaining[..4]) {
syllables.push(syllable);
remaining = remaining[4..].to_string();
remaining.remove(0);
remaining.remove(0);
remaining.remove(0);
remaining.remove(0);
continue;
}
}
if remaining.len() >= 3 {
if let Some(syllable) = match3(&remaining[..3]) {
syllables.push(syllable);
remaining = remaining[3..].to_string();
remaining.remove(0);
remaining.remove(0);
remaining.remove(0);
continue;
}
}
if remaining.len() >= 2 {
if let Some(syllable) = match2(&remaining[..2]) {
syllables.push(syllable);
remaining = remaining[2..].to_string();
remaining.remove(0);
remaining.remove(0);
continue;
}
}
if let Some(syllable) = match1(&remaining[..1]) {
if let Some(syllable) = match1(&remaining[0]) {
syllables.push(syllable);
remaining = remaining[1..].to_string();
remaining.remove(0);
continue;
}
panic!("No match found for {} ({})", remaining, word);
panic!("No match found for {} ({})", remaining.iter().collect::<String>(), word);
}
syllables
@@ -345,6 +352,11 @@ mod tests {
}
}
/// A multi-byte UTF-8 symbol between words must come through as a single
/// `NonAlpha` syllable, alongside the surrounding spaces.
#[test]
fn test_utf8_symbol() {
    let expected = vec![
        So,
        Me,
        NonAlpha(' '),
        NonAlpha('—'),
        NonAlpha(' '),
        Te,
        Su,
        Tsu,
    ];
    assert_eq!(romanize("Some — test"), expected);
}
#[test]
fn test_dictionary() {
// Read dictionary file