romanization works!

This commit is contained in:
Hamcha 2024-10-24 23:07:09 +02:00
commit 3d5a7cc63f
Signed by: hamcha
GPG key ID: 1669C533B8CF6D89
8 changed files with 191014 additions and 0 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
/target
.idea

7
Cargo.lock generated Normal file
View file

@ -0,0 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "weeblib"
version = "0.1.0"

8
Cargo.toml Normal file
View file

@ -0,0 +1,8 @@
[package]
name = "weeblib"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

17
src/lib.rs Normal file
View file

@ -0,0 +1,17 @@
mod romanization;
mod syllable;
/// Returns the sum of `left` and `right`.
pub fn add(left: usize, right: usize) -> usize {
    [left, right].iter().sum()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Sanity check: `add` returns the sum of its two operands.
    #[test]
    fn it_works() {
        assert_eq!(add(2, 2), 4);
    }
}

353
src/romanization.rs Normal file
View file

@ -0,0 +1,353 @@
use crate::syllable::{Syllable, Syllable::*};
/// Looks up a four-letter cluster (`sci`, `chi` or `ghi` followed by `a`,
/// `o` or `u`) and returns the contracted syllable it stands for.
///
/// Returns `None` when `s` is not one of the listed clusters.
fn match4(s: &str) -> Option<Syllable> {
    let syllable = match s {
        // "sci" + vowel
        "scia" => Sha,
        "scio" => Sho,
        "sciu" => Shu,
        // "chi" + vowel
        "chia" => Kya,
        "chio" => Kyo,
        "chiu" => Kyu,
        // "ghi" + vowel
        "ghia" => Gya,
        "ghio" => Gyo,
        "ghiu" => Gyu,
        _ => return None,
    };
    Some(syllable)
}
/// Looks up a three-letter cluster and returns the syllable it stands for.
///
/// Returns `None` when `s` is not one of the listed clusters.
fn match3(s: &str) -> Option<Syllable> {
    let syllable = match s {
        // Clusters that map onto a plain syllable
        "sci" => Shi,
        "chi" => Ki,
        "che" => Ke,
        "tsu" => Tsu,
        "ghi" => Gi,
        "ghe" => Ge,
        // Consonant + "i" + vowel, contracted into a single syllable
        "ria" => Rya,
        "rio" => Ryo,
        "riu" => Ryu,
        "cia" => Cha,
        "cio" => Cho,
        "ciu" => Chu,
        "pia" => Pya,
        "pio" => Pyo,
        "piu" => Pyu,
        "bia" => Bya,
        "bio" => Byo,
        "biu" => Byu,
        "gia" => Ja,
        "giu" => Ju,
        "gio" => Jo,
        "zia" => Zya,
        "zie" => Zye,
        "zio" => Zyo,
        "ziu" => Zyu,
        _ => return None,
    };
    Some(syllable)
}
fn match2(s: &str) -> Option<Syllable> {
match s {
"ka" => Some(Ka),
"ca" => Some(Ka),
"ki" => Some(Ki),
"ku" => Some(Ku),
"cu" => Some(Ku),
"ke" => Some(Ke),
"ko" => Some(Ko),
"co" => Some(Ko),
"sa" => Some(Sa),
"su" => Some(Su),
"se" => Some(Se),
"so" => Some(So),
"ta" => Some(Ta),
"ci" => Some(Chi),
"te" => Some(Te),
"to" => Some(To),
"na" => Some(Na),
"ni" => Some(Ni),
"nu" => Some(Nu),
"ne" => Some(Ne),
"no" => Some(No),
"ha" => Some(Ha),
"hi" => Some(Hi),
"fu" => Some(Fu),
"he" => Some(He),
"ho" => Some(Ho),
"ma" => Some(Ma),
"mi" => Some(Mi),
"mu" => Some(Mu),
"me" => Some(Me),
"mo" => Some(Mo),
"ya" => Some(Ya),
"yu" => Some(Yu),
"yo" => Some(Yo),
"ra" => Some(Ra),
"ri" => Some(Ri),
"ru" => Some(Ru),
"re" => Some(Re),
"ro" => Some(Ro),
"wa" => Some(Wa),
"ga" => Some(Ga),
"gi" => Some(Ji),
"gu" => Some(Gu),
"go" => Some(Go),
"za" => Some(Za),
"zu" => Some(Zu),
"ze" => Some(Ze),
"zo" => Some(Zo),
"da" => Some(Da),
"de" => Some(De),
"do" => Some(Do),
"ba" => Some(Ba),
"bi" => Some(Bi),
"bu" => Some(Bu),
"be" => Some(Be),
"bo" => Some(Bo),
"pa" => Some(Pa),
"pi" => Some(Pi),
"pu" => Some(Pu),
"pe" => Some(Pe),
"po" => Some(Po),
"ce" => Some(ChiE),
"la" => Some(Ra),
"li" => Some(Ri),
"lu" => Some(Ru),
"le" => Some(Re),
"lo" => Some(Ro),
"si" => Some(Shi),
"ti" => Some(TeI),
"tu" => Some(ToU),
"va" => Some(VuA),
"ve" => Some(VuE),
"vi" => Some(VuI),
"vu" => Some(Vu),
"vo" => Some(VuO),
"qu" => Some(Ku),
"du" => Some(Zu),
"ge" => Some(Je),
"zi" => Some(Zi),
_ => None,
}
}
/// Looks up a single letter and returns the syllable used when no longer
/// pattern applies.
///
/// Returns `None` for any letter that is not listed.
fn match1(s: &str) -> Option<Syllable> {
    let syllable = match s {
        // Vowels ("j" is read like "i")
        "a" => A,
        "i" | "j" => I,
        "u" => U,
        "e" => E,
        "o" => O,
        // Lone consonants
        "n" => N,
        "s" => Su,
        "r" | "l" => Ru,
        "b" => Bu,
        "m" => Mu,
        "y" => Yu,
        "w" => Wa,
        "g" => Gu,
        "z" => Zu,
        "d" => De,
        "p" => Pu,
        "k" | "c" => Ku,
        "f" => Fu,
        "t" => Tsu,
        "v" => Vu,
        "q" => Kyu,
        // A lone "h" is emitted as the long-vowel modifier
        "h" => LongVowel,
        _ => return None,
    };
    Some(syllable)
}
/// Folds an accented lowercase letter onto its unaccented base letter.
///
/// Handles the grave, acute, circumflex and diaeresis forms of the five
/// vowels plus `ç`. Any other character is returned unchanged.
fn remove_accents(c: char) -> char {
    match c {
        'à' | 'á' | 'â' | 'ä' => 'a',
        'è' | 'é' | 'ê' | 'ë' => 'e',
        'ì' | 'í' | 'î' | 'ï' => 'i',
        'ò' | 'ó' | 'ô' | 'ö' => 'o',
        'ù' | 'ú' | 'û' | 'ü' => 'u',
        'ç' => 'c',
        _ => c,
    }
}
pub fn romanize(word: &str) -> Vec<Syllable> {
let lowercase: String = word
.to_lowercase()
.chars()
.map(remove_accents)
.filter(|c| c.is_alphabetic())
.collect();
let mut remaining = lowercase.clone();
let mut syllables = Vec::new();
while remaining.len() > 0 {
// Check for double consonants
if remaining.len() >= 3 && remaining.as_bytes()[0] == remaining.as_bytes()[1] {
syllables.push(LittleTsu);
remaining = remaining[1..].to_string();
continue;
}
// Check for X
if remaining.as_bytes()[0] == b'x' {
syllables.push(Ku);
if remaining.len() < 2 {
// Last letter
syllables.push(Su);
break;
}
// Replace X with S
remaining = ["s", &remaining[1..]].concat();
}
// Check for 4 letter patterns, then 3, etc.
if remaining.len() >= 4 {
if let Some(syllable) = match4(&remaining[..4]) {
syllables.push(syllable);
remaining = remaining[4..].to_string();
continue;
}
}
if remaining.len() >= 3 {
if let Some(syllable) = match3(&remaining[..3]) {
syllables.push(syllable);
remaining = remaining[3..].to_string();
continue;
}
}
if remaining.len() >= 2 {
if let Some(syllable) = match2(&remaining[..2]) {
syllables.push(syllable);
remaining = remaining[2..].to_string();
continue;
}
}
if let Some(syllable) = match1(&remaining[..1]) {
syllables.push(syllable);
remaining = remaining[1..].to_string();
continue;
}
panic!("No match found for {} ({})", remaining, word);
}
syllables
}
#[cfg(test)]
mod tests {
    use super::*;
    //use crate::syllable::to_hiragana;
    use std::path::Path;

    /// Checks every `(word, expected syllables)` pair and names the offending
    /// word when a comparison fails.
    fn assert_romanized(cases: &[(&str, Vec<Syllable>)]) {
        for (word, expected) in cases {
            assert_eq!(&romanize(word), expected, "wrong syllables for {:?}", word);
        }
    }

    #[test]
    fn simple_words() {
        // Words and their expected representations
        assert_romanized(&[
            ("zanzara", vec![Za, N, Za, Ra]),
            ("zero", vec![Ze, Ro]),
            ("asta", vec![A, Su, Ta]),
            ("storia", vec![Su, To, Rya]),
            ("scienza", vec![Shi, E, N, Za]),
            ("sentai", vec![Se, N, Ta, I]),
        ]);
    }

    #[test]
    fn complex_words() {
        assert_romanized(&[
            ("arpione", vec![A, Ru, Pyo, Ne]),
            ("pianura", vec![Pya, Nu, Ra]),
            ("acciderbolina", vec![A, LittleTsu, Chi, De, Ru, Bo, Ri, Na]),
            ("asciugare", vec![A, Shu, Ga, Re]),
            ("chiuso", vec![Kyu, So]),
            ("alex", vec![A, Re, Ku, Su]),
            ("pierluigi", vec![Pi, E, Ru, Ru, I, Ji]),
            ("duecento", vec![Zu, E, ChiE, N, To]),
            ("refrigeratore", vec![Re, Fu, Ri, Je, Ra, To, Re]),
        ]);
    }

    #[test]
    fn mixed_case() {
        // Casing must not affect the result
        assert_romanized(&[
            ("Zanzara", vec![Za, N, Za, Ra]),
            ("ScIeNza", vec![Shi, E, N, Za]),
        ]);
    }

    #[test]
    fn double_consonants() {
        // A doubled consonant turns into a little tsu plus the syllable
        assert_romanized(&[
            ("asso", vec![A, LittleTsu, So]),
            ("porcellana", vec![Po, Ru, ChiE, LittleTsu, Ra, Na]),
            ("ruggine", vec![Ru, LittleTsu, Ji, Ne]),
            ("accidenti", vec![A, LittleTsu, Chi, De, N, TeI]),
        ]);
    }

    #[test]
    fn accented_letters() {
        // Accents are folded onto the base letter before matching
        assert_romanized(&[
            ("così", vec![Ko, Shi]),
            ("perché", vec![Pe, Ru, Ke]),
            ("là", vec![Ra]),
            ("capirà", vec![Ka, Pi, Ra]),
        ]);
    }

    /// Smoke test: every dictionary word must romanize without panicking.
    #[test]
    fn test_dictionary() {
        // Read dictionary file
        let test_path = Path::new(file!())
            .parent()
            .unwrap()
            .join("./test_assets/parolone.txt");
        let dictionary =
            std::fs::read_to_string(test_path).expect("dictionary file should be readable");
        for word in dictionary.lines() {
            romanize(word.trim());
        }
        /*
        let mut str = "".to_string();
        for word in words {
            let romanized = romanize(word.trim());
            let formatted = format!("{} -> {}\n", word, romanized.iter().map(to_hiragana).collect::<Vec<String>>().join(""));
            str.push_str(&formatted);
        }
        // Write to file
        let output_path = Path::new(file!())
            .parent()
            .unwrap()
            .join("./test_assets/romanized.txt");
        std::fs::write(output_path, str).unwrap();
        */
    }
}

241
src/syllable.rs Normal file
View file

@ -0,0 +1,241 @@
use crate::syllable::Syllable::*;
use std::fmt::Display;
/// A single kana unit produced by romanization.
///
/// Variants are named after the romanized reading of the unit they stand
/// for; `to_hiragana` gives the actual spelling. The enum carries no data, so
/// it is cheap to copy and can be used as a hash key.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum Syllable {
    // Vowels
    A,
    I,
    U,
    E,
    O,
    // K row
    Ka,
    Ki,
    Ku,
    Ke,
    Ko,
    // S row
    Sa,
    Shi,
    Su,
    Se,
    So,
    // T row
    Ta,
    Chi,
    Tsu,
    Te,
    To,
    // N row
    Na,
    Ni,
    Nu,
    Ne,
    No,
    // H row
    Ha,
    Hi,
    Fu,
    He,
    Ho,
    // M row
    Ma,
    Mi,
    Mu,
    Me,
    Mo,
    // Y row
    Ya,
    Yu,
    Yo,
    // R row
    Ra,
    Ri,
    Ru,
    Re,
    Ro,
    Wa,
    N,
    // G row
    Ga,
    Gi,
    Gu,
    Ge,
    Go,
    // Z row
    Za,
    Ji,
    Zu,
    Ze,
    Zo,
    // D row
    Da,
    De,
    Do,
    // B row
    Ba,
    Bi,
    Bu,
    Be,
    Bo,
    // P row
    Pa,
    Pi,
    Pu,
    Pe,
    Po,
    // Standard digraphs
    Kya,
    Kyu,
    Kyo,
    Sha,
    Shu,
    Sho,
    Cha,
    Chu,
    Cho,
    Ja,
    Ju,
    Jo,
    Gya,
    Gyu,
    Gyo,
    Bya,
    Byu,
    Byo,
    Pya,
    Pyu,
    Pyo,
    // Not defined yet: Nya, Nyu, Nyo, Mya, Myu, Myo
    Rya,
    Ryu,
    Ryo,
    // Modifiers
    LittleTsu,
    LongVowel,
    // Mixed: a base kana followed by a small vowel
    ChiE,
    ToU,
    TeI,
    Je,
    Zi,
    // Vu, alone and followed by a small vowel
    VuA,
    VuI,
    Vu,
    VuE,
    VuO,
    // Z-based combinations that only exist for Italian spellings
    Zyo,
    Zya,
    Zyu,
    Zye,
}
/// Returns the hiragana spelling of `syllable` as an owned `String`.
///
/// Plain syllables map to a single kana; digraphs and the mixed variants map
/// to a base kana followed by a small kana.
pub(crate) fn to_hiragana(syllable: &Syllable) -> String {
    match syllable {
        // Vowels
        A => "あ",
        I => "い",
        U => "う",
        E => "え",
        O => "お",
        // K row
        Ka => "か",
        Ki => "き",
        Ku => "く",
        Ke => "け",
        Ko => "こ",
        // S row
        Sa => "さ",
        Shi => "し",
        Su => "す",
        Se => "せ",
        So => "そ",
        // T row
        Ta => "た",
        Chi => "ち",
        Tsu => "つ",
        Te => "て",
        To => "と",
        // N row
        Na => "な",
        Ni => "に",
        Nu => "ぬ",
        Ne => "ね",
        No => "の",
        // H row
        Ha => "は",
        Hi => "ひ",
        Fu => "ふ",
        He => "へ",
        Ho => "ほ",
        // M row
        Ma => "ま",
        Mi => "み",
        Mu => "む",
        Me => "め",
        Mo => "も",
        // Y row
        Ya => "や",
        Yu => "ゆ",
        Yo => "よ",
        // R row
        Ra => "ら",
        Ri => "り",
        Ru => "る",
        Re => "れ",
        Ro => "ろ",
        Wa => "わ",
        N => "ん",
        // G row
        Ga => "が",
        Gi => "ぎ",
        Gu => "ぐ",
        Ge => "げ",
        Go => "ご",
        // Z row
        Za => "ざ",
        Ji => "じ",
        Zu => "ず",
        Ze => "ぜ",
        Zo => "ぞ",
        // D row
        Da => "だ",
        De => "で",
        Do => "ど",
        // B row
        Ba => "ば",
        Bi => "び",
        Bu => "ぶ",
        Be => "べ",
        Bo => "ぼ",
        // P row
        Pa => "ぱ",
        Pi => "ぴ",
        Pu => "ぷ",
        Pe => "ぺ",
        Po => "ぽ",
        // Standard digraphs
        Kya => "きゃ",
        Kyu => "きゅ",
        Kyo => "きょ",
        Sha => "しゃ",
        Shu => "しゅ",
        Sho => "しょ",
        Cha => "ちゃ",
        Chu => "ちゅ",
        Cho => "ちょ",
        Rya => "りゃ",
        Ryu => "りゅ",
        Ryo => "りょ",
        // Modifiers
        LittleTsu => "っ",
        LongVowel => "ー",
        // Mixed: base kana + small vowel
        ChiE => "ちぇ",
        ToU => "とぅ",
        // Te + small i, parallel to ToU above (To + small u)
        TeI => "てぃ",
        VuA => "ゔぁ",
        VuI => "ゔぃ",
        Vu => "ゔ",
        VuE => "ゔぇ",
        VuO => "ゔぉ",
        Ja => "じゃ",
        Ju => "じゅ",
        Jo => "じょ",
        Je => "じぇ",
        Zi => "ずぃ",
        Gya => "ぎゃ",
        Gyu => "ぎゅ",
        Gyo => "ぎょ",
        Bya => "びゃ",
        Byu => "びゅ",
        Byo => "びょ",
        Pya => "ぴゃ",
        Pyu => "ぴゅ",
        Pyo => "ぴょ",
        Zyo => "ずょ",
        Zya => "ずゃ",
        Zyu => "ずゅ",
        Zye => "ずぇ",
    }
    .to_string()
}
impl Display for Syllable {
    /// Writes the spelling returned by `to_hiragana` for this syllable.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let kana = to_hiragana(self);
        f.write_str(&kana)
    }
}

95193
src/test_assets/parolone.txt Normal file

File diff suppressed because it is too large Load diff

95193
src/test_assets/romanized.txt Normal file

File diff suppressed because it is too large Load diff