fix unicode issues
This commit is contained in:
parent
e09f185bb1
commit
8e830d03cb
3 changed files with 60 additions and 48 deletions
2
Cargo.lock
generated
2
Cargo.lock
generated
|
@@ -235,7 +235,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "weeblib"
|
||||
version = "0.1.1"
|
||||
version = "0.1.2"
|
||||
dependencies = [
|
||||
"console_error_panic_hook",
|
||||
"wasm-bindgen",
|
||||
|
|
|
@@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "weeblib"
|
||||
version = "0.1.1"
|
||||
version = "0.1.2"
|
||||
edition = "2021"
|
||||
authors = ["Hamcha <hamcha@crunchy.rocks>"]
|
||||
|
||||
|
|
|
@@ -1,7 +1,7 @@
|
|||
use crate::syllable::{Syllable, Syllable::*};
|
||||
|
||||
fn match4(s: &str) -> Option<Syllable> {
|
||||
match s {
|
||||
fn match4(s: &[char]) -> Option<Syllable> {
|
||||
match s.iter().collect::<String>().as_str() {
|
||||
"scia" => Some(Sha),
|
||||
"scio" => Some(Sho),
|
||||
"sciu" => Some(Shu),
|
||||
|
@@ -15,8 +15,8 @@ fn match4(s: &str) -> Option<Syllable> {
|
|||
}
|
||||
}
|
||||
|
||||
fn match3(s: &str) -> Option<Syllable> {
|
||||
match s {
|
||||
fn match3(s: &[char]) -> Option<Syllable> {
|
||||
match s.iter().collect::<String>().as_str() {
|
||||
"sci" => Some(Shi),
|
||||
"chi" => Some(Ki),
|
||||
"che" => Some(Ke),
|
||||
|
@@ -48,8 +48,8 @@ fn match3(s: &str) -> Option<Syllable> {
|
|||
}
|
||||
}
|
||||
|
||||
fn match2(s: &str) -> Option<Syllable> {
|
||||
match s {
|
||||
fn match2(s: &[char]) -> Option<Syllable> {
|
||||
match s.iter().collect::<String>().as_str() {
|
||||
"ka" => Some(Ka),
|
||||
"ca" => Some(Ka),
|
||||
"ki" => Some(Ki),
|
||||
|
@@ -133,33 +133,33 @@ fn match2(s: &str) -> Option<Syllable> {
|
|||
}
|
||||
}
|
||||
|
||||
fn match1(s: &str) -> Option<Syllable> {
|
||||
fn match1(s: &char) -> Option<Syllable> {
|
||||
match s {
|
||||
"a" => Some(A),
|
||||
"i" => Some(I),
|
||||
"u" => Some(U),
|
||||
"e" => Some(E),
|
||||
"o" => Some(O),
|
||||
"n" => Some(N),
|
||||
"s" => Some(Su),
|
||||
"r" => Some(Ru),
|
||||
"b" => Some(Bu),
|
||||
"m" => Some(Mu),
|
||||
"y" => Some(Yu),
|
||||
"w" => Some(Wa),
|
||||
"g" => Some(Gu),
|
||||
"z" => Some(Zu),
|
||||
"d" => Some(De),
|
||||
"p" => Some(Pu),
|
||||
"k" => Some(Ku),
|
||||
"f" => Some(Fu),
|
||||
"c" => Some(Ku),
|
||||
"t" => Some(Tsu),
|
||||
"v" => Some(Vu),
|
||||
"l" => Some(Ru),
|
||||
"h" => Some(LongVowel),
|
||||
"j" => Some(I),
|
||||
"q" => Some(Kyu),
|
||||
'a' => Some(A),
|
||||
'i' => Some(I),
|
||||
'u' => Some(U),
|
||||
'e' => Some(E),
|
||||
'o' => Some(O),
|
||||
'n' => Some(N),
|
||||
's' => Some(Su),
|
||||
'r' => Some(Ru),
|
||||
'b' => Some(Bu),
|
||||
'm' => Some(Mu),
|
||||
'y' => Some(Yu),
|
||||
'w' => Some(Wa),
|
||||
'g' => Some(Gu),
|
||||
'z' => Some(Zu),
|
||||
'd' => Some(De),
|
||||
'p' => Some(Pu),
|
||||
'k' => Some(Ku),
|
||||
'f' => Some(Fu),
|
||||
'c' => Some(Ku),
|
||||
't' => Some(Tsu),
|
||||
'v' => Some(Vu),
|
||||
'l' => Some(Ru),
|
||||
'h' => Some(LongVowel),
|
||||
'j' => Some(I),
|
||||
'q' => Some(Kyu),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
@@ -186,23 +186,24 @@ pub(crate) fn romanize(word: &str) -> Vec<Syllable> {
|
|||
.chars()
|
||||
.map(remove_accents)
|
||||
.collect();
|
||||
let mut remaining = lowercase.clone();
|
||||
let mut remaining: Vec<char> = lowercase.clone().chars().collect();
|
||||
let mut syllables = Vec::new();
|
||||
|
||||
while remaining.len() > 0 {
|
||||
if !remaining.as_bytes()[0].is_ascii_alphabetic() {
|
||||
syllables.push(NonAlpha(remaining.as_bytes()[0] as char));
|
||||
remaining = remaining[1..].to_string();
|
||||
let next = remaining[0];
|
||||
if !next.is_alphabetic() {
|
||||
syllables.push(NonAlpha(next));
|
||||
remaining.remove(0);
|
||||
continue;
|
||||
}
|
||||
// Check for double consonants
|
||||
if remaining.len() >= 3 && remaining.as_bytes()[0] == remaining.as_bytes()[1] {
|
||||
if remaining.len() >= 3 && remaining[0] == remaining[1] {
|
||||
syllables.push(LittleTsu);
|
||||
remaining = remaining[1..].to_string();
|
||||
remaining.remove(0);
|
||||
continue;
|
||||
}
|
||||
// Check for X
|
||||
if remaining.as_bytes()[0] == b'x' {
|
||||
if remaining[0] == 'x' {
|
||||
syllables.push(Ku);
|
||||
if remaining.len() < 2 {
|
||||
// Last letter
|
||||
|
@@ -210,36 +211,42 @@ pub(crate) fn romanize(word: &str) -> Vec<Syllable> {
|
|||
break;
|
||||
}
|
||||
// Replace X with S
|
||||
remaining = ["s", &remaining[1..]].concat();
|
||||
remaining[0] = 's';
|
||||
}
|
||||
// Check for 4 letter patterns, then 3, etc.
|
||||
if remaining.len() >= 4 {
|
||||
if let Some(syllable) = match4(&remaining[..4]) {
|
||||
syllables.push(syllable);
|
||||
remaining = remaining[4..].to_string();
|
||||
remaining.remove(0);
|
||||
remaining.remove(0);
|
||||
remaining.remove(0);
|
||||
remaining.remove(0);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if remaining.len() >= 3 {
|
||||
if let Some(syllable) = match3(&remaining[..3]) {
|
||||
syllables.push(syllable);
|
||||
remaining = remaining[3..].to_string();
|
||||
remaining.remove(0);
|
||||
remaining.remove(0);
|
||||
remaining.remove(0);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if remaining.len() >= 2 {
|
||||
if let Some(syllable) = match2(&remaining[..2]) {
|
||||
syllables.push(syllable);
|
||||
remaining = remaining[2..].to_string();
|
||||
remaining.remove(0);
|
||||
remaining.remove(0);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if let Some(syllable) = match1(&remaining[..1]) {
|
||||
if let Some(syllable) = match1(&remaining[0]) {
|
||||
syllables.push(syllable);
|
||||
remaining = remaining[1..].to_string();
|
||||
remaining.remove(0);
|
||||
continue;
|
||||
}
|
||||
panic!("No match found for {} ({})", remaining, word);
|
||||
panic!("No match found for {} ({})", remaining.iter().collect::<String>(), word);
|
||||
}
|
||||
|
||||
syllables
|
||||
|
@@ -345,6 +352,11 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_utf8_symbol() {
|
||||
assert_eq!(romanize("Some — test"), vec![So, Me, NonAlpha(' '), NonAlpha('—'), NonAlpha(' '), Te, Su, Tsu]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dictionary() {
|
||||
// Read dictionary file
|
||||
|
|
Loading…
Reference in a new issue