romanization works!
This commit is contained in:
commit
3d5a7cc63f
8 changed files with 191014 additions and 0 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
/target
|
||||
.idea
|
7
Cargo.lock
generated
Normal file
7
Cargo.lock
generated
Normal file
|
@ -0,0 +1,7 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "weeblib"
|
||||
version = "0.1.0"
|
8
Cargo.toml
Normal file
8
Cargo.toml
Normal file
|
@ -0,0 +1,8 @@
|
|||
[package]
|
||||
name = "weeblib"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
17
src/lib.rs
Normal file
17
src/lib.rs
Normal file
|
@ -0,0 +1,17 @@
|
|||
mod romanization;
|
||||
mod syllable;
|
||||
|
||||
/// Returns the sum of `left` and `right`.
pub fn add(left: usize, right: usize) -> usize {
    let sum = left + right;
    sum
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: `add` returns the sum of its two arguments.
    #[test]
    fn it_works() {
        assert_eq!(add(2, 2), 4);
    }
}
|
353
src/romanization.rs
Normal file
353
src/romanization.rs
Normal file
|
@ -0,0 +1,353 @@
|
|||
use crate::syllable::{Syllable, Syllable::*};
|
||||
|
||||
fn match4(s: &str) -> Option<Syllable> {
|
||||
match s {
|
||||
"scia" => Some(Sha),
|
||||
"scio" => Some(Sho),
|
||||
"sciu" => Some(Shu),
|
||||
"chia" => Some(Kya),
|
||||
"chio" => Some(Kyo),
|
||||
"chiu" => Some(Kyu),
|
||||
"ghia" => Some(Gya),
|
||||
"ghio" => Some(Gyo),
|
||||
"ghiu" => Some(Gyu),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn match3(s: &str) -> Option<Syllable> {
|
||||
match s {
|
||||
"sci" => Some(Shi),
|
||||
"chi" => Some(Ki),
|
||||
"che" => Some(Ke),
|
||||
"tsu" => Some(Tsu),
|
||||
"ghi" => Some(Gi),
|
||||
"ghe" => Some(Ge),
|
||||
"ria" => Some(Rya),
|
||||
"rio" => Some(Ryo),
|
||||
"riu" => Some(Ryu),
|
||||
"cia" => Some(Cha),
|
||||
"cio" => Some(Cho),
|
||||
"ciu" => Some(Chu),
|
||||
"pia" => Some(Pya),
|
||||
"pio" => Some(Pyo),
|
||||
"piu" => Some(Pyu),
|
||||
"bia" => Some(Bya),
|
||||
"bio" => Some(Byo),
|
||||
"biu" => Some(Byu),
|
||||
"gia" => Some(Ja),
|
||||
"giu" => Some(Ju),
|
||||
"gio" => Some(Jo),
|
||||
"zia" => Some(Zya),
|
||||
"zie" => Some(Zye),
|
||||
"zio" => Some(Zyo),
|
||||
"ziu" => Some(Zyu),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn match2(s: &str) -> Option<Syllable> {
|
||||
match s {
|
||||
"ka" => Some(Ka),
|
||||
"ca" => Some(Ka),
|
||||
"ki" => Some(Ki),
|
||||
"ku" => Some(Ku),
|
||||
"cu" => Some(Ku),
|
||||
"ke" => Some(Ke),
|
||||
"ko" => Some(Ko),
|
||||
"co" => Some(Ko),
|
||||
"sa" => Some(Sa),
|
||||
"su" => Some(Su),
|
||||
"se" => Some(Se),
|
||||
"so" => Some(So),
|
||||
"ta" => Some(Ta),
|
||||
"ci" => Some(Chi),
|
||||
"te" => Some(Te),
|
||||
"to" => Some(To),
|
||||
"na" => Some(Na),
|
||||
"ni" => Some(Ni),
|
||||
"nu" => Some(Nu),
|
||||
"ne" => Some(Ne),
|
||||
"no" => Some(No),
|
||||
"ha" => Some(Ha),
|
||||
"hi" => Some(Hi),
|
||||
"fu" => Some(Fu),
|
||||
"he" => Some(He),
|
||||
"ho" => Some(Ho),
|
||||
"ma" => Some(Ma),
|
||||
"mi" => Some(Mi),
|
||||
"mu" => Some(Mu),
|
||||
"me" => Some(Me),
|
||||
"mo" => Some(Mo),
|
||||
"ya" => Some(Ya),
|
||||
"yu" => Some(Yu),
|
||||
"yo" => Some(Yo),
|
||||
"ra" => Some(Ra),
|
||||
"ri" => Some(Ri),
|
||||
"ru" => Some(Ru),
|
||||
"re" => Some(Re),
|
||||
"ro" => Some(Ro),
|
||||
"wa" => Some(Wa),
|
||||
"ga" => Some(Ga),
|
||||
"gi" => Some(Ji),
|
||||
"gu" => Some(Gu),
|
||||
"go" => Some(Go),
|
||||
"za" => Some(Za),
|
||||
"zu" => Some(Zu),
|
||||
"ze" => Some(Ze),
|
||||
"zo" => Some(Zo),
|
||||
"da" => Some(Da),
|
||||
"de" => Some(De),
|
||||
"do" => Some(Do),
|
||||
"ba" => Some(Ba),
|
||||
"bi" => Some(Bi),
|
||||
"bu" => Some(Bu),
|
||||
"be" => Some(Be),
|
||||
"bo" => Some(Bo),
|
||||
"pa" => Some(Pa),
|
||||
"pi" => Some(Pi),
|
||||
"pu" => Some(Pu),
|
||||
"pe" => Some(Pe),
|
||||
"po" => Some(Po),
|
||||
"ce" => Some(ChiE),
|
||||
"la" => Some(Ra),
|
||||
"li" => Some(Ri),
|
||||
"lu" => Some(Ru),
|
||||
"le" => Some(Re),
|
||||
"lo" => Some(Ro),
|
||||
"si" => Some(Shi),
|
||||
"ti" => Some(TeI),
|
||||
"tu" => Some(ToU),
|
||||
"va" => Some(VuA),
|
||||
"ve" => Some(VuE),
|
||||
"vi" => Some(VuI),
|
||||
"vu" => Some(Vu),
|
||||
"vo" => Some(VuO),
|
||||
"qu" => Some(Ku),
|
||||
"du" => Some(Zu),
|
||||
"ge" => Some(Je),
|
||||
"zi" => Some(Zi),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn match1(s: &str) -> Option<Syllable> {
|
||||
match s {
|
||||
"a" => Some(A),
|
||||
"i" => Some(I),
|
||||
"u" => Some(U),
|
||||
"e" => Some(E),
|
||||
"o" => Some(O),
|
||||
"n" => Some(N),
|
||||
"s" => Some(Su),
|
||||
"r" => Some(Ru),
|
||||
"b" => Some(Bu),
|
||||
"m" => Some(Mu),
|
||||
"y" => Some(Yu),
|
||||
"w" => Some(Wa),
|
||||
"g" => Some(Gu),
|
||||
"z" => Some(Zu),
|
||||
"d" => Some(De),
|
||||
"p" => Some(Pu),
|
||||
"k" => Some(Ku),
|
||||
"f" => Some(Fu),
|
||||
"c" => Some(Ku),
|
||||
"t" => Some(Tsu),
|
||||
"v" => Some(Vu),
|
||||
"l" => Some(Ru),
|
||||
"h" => Some(LongVowel),
|
||||
"j" => Some(I),
|
||||
"q" => Some(Kyu),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Maps a lowercase accented letter to its unaccented ASCII base letter.
///
/// Covers the five vowels with grave, acute, circumflex and diaeresis marks,
/// plus `ç`. Any other character is returned unchanged.
fn remove_accents(c: char) -> char {
    match c {
        'à' | 'á' | 'â' | 'ä' => 'a',
        'è' | 'é' | 'ê' | 'ë' => 'e',
        'ì' | 'í' | 'î' | 'ï' => 'i',
        'ò' | 'ó' | 'ô' | 'ö' => 'o',
        'ù' | 'ú' | 'û' | 'ü' => 'u',
        'ç' => 'c',
        _ => c,
    }
}
|
||||
|
||||
pub fn romanize(word: &str) -> Vec<Syllable> {
|
||||
let lowercase: String = word
|
||||
.to_lowercase()
|
||||
.chars()
|
||||
.map(remove_accents)
|
||||
.filter(|c| c.is_alphabetic())
|
||||
.collect();
|
||||
let mut remaining = lowercase.clone();
|
||||
let mut syllables = Vec::new();
|
||||
|
||||
while remaining.len() > 0 {
|
||||
// Check for double consonants
|
||||
if remaining.len() >= 3 && remaining.as_bytes()[0] == remaining.as_bytes()[1] {
|
||||
syllables.push(LittleTsu);
|
||||
remaining = remaining[1..].to_string();
|
||||
continue;
|
||||
}
|
||||
// Check for X
|
||||
if remaining.as_bytes()[0] == b'x' {
|
||||
syllables.push(Ku);
|
||||
if remaining.len() < 2 {
|
||||
// Last letter
|
||||
syllables.push(Su);
|
||||
break;
|
||||
}
|
||||
// Replace X with S
|
||||
remaining = ["s", &remaining[1..]].concat();
|
||||
}
|
||||
// Check for 4 letter patterns, then 3, etc.
|
||||
if remaining.len() >= 4 {
|
||||
if let Some(syllable) = match4(&remaining[..4]) {
|
||||
syllables.push(syllable);
|
||||
remaining = remaining[4..].to_string();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if remaining.len() >= 3 {
|
||||
if let Some(syllable) = match3(&remaining[..3]) {
|
||||
syllables.push(syllable);
|
||||
remaining = remaining[3..].to_string();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if remaining.len() >= 2 {
|
||||
if let Some(syllable) = match2(&remaining[..2]) {
|
||||
syllables.push(syllable);
|
||||
remaining = remaining[2..].to_string();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if let Some(syllable) = match1(&remaining[..1]) {
|
||||
syllables.push(syllable);
|
||||
remaining = remaining[1..].to_string();
|
||||
continue;
|
||||
}
|
||||
panic!("No match found for {} ({})", remaining, word);
|
||||
}
|
||||
|
||||
syllables
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    //use crate::syllable::to_hiragana;
    use std::path::Path;

    /// Asserts that every word in `cases` romanizes to its expected syllables,
    /// naming the offending word when a comparison fails.
    fn assert_romanizations(cases: &[(&str, Vec<Syllable>)]) {
        for (word, expected) in cases {
            assert_eq!(
                romanize(word)[..],
                expected[..],
                "unexpected romanization of {:?}",
                word
            );
        }
    }

    #[test]
    fn simple_words() {
        // Words and their expected representations
        assert_romanizations(&[
            ("zanzara", vec![Za, N, Za, Ra]),
            ("zero", vec![Ze, Ro]),
            ("asta", vec![A, Su, Ta]),
            ("storia", vec![Su, To, Rya]),
            ("scienza", vec![Shi, E, N, Za]),
            ("sentai", vec![Se, N, Ta, I]),
        ]);
    }

    #[test]
    fn complex_words() {
        assert_romanizations(&[
            ("arpione", vec![A, Ru, Pyo, Ne]),
            ("pianura", vec![Pya, Nu, Ra]),
            ("acciderbolina", vec![A, LittleTsu, Chi, De, Ru, Bo, Ri, Na]),
            ("asciugare", vec![A, Shu, Ga, Re]),
            ("chiuso", vec![Kyu, So]),
            ("alex", vec![A, Re, Ku, Su]),
            ("pierluigi", vec![Pi, E, Ru, Ru, I, Ji]),
            ("duecento", vec![Zu, E, ChiE, N, To]),
            ("refrigeratore", vec![Re, Fu, Ri, Je, Ra, To, Re]),
        ]);
    }

    #[test]
    fn mixed_case() {
        // Words and their expected representations
        assert_romanizations(&[
            ("Zanzara", vec![Za, N, Za, Ra]),
            ("ScIeNza", vec![Shi, E, N, Za]),
        ]);
    }

    #[test]
    fn double_consonants() {
        // Words and their expected representations
        assert_romanizations(&[
            ("asso", vec![A, LittleTsu, So]),
            ("porcellana", vec![Po, Ru, ChiE, LittleTsu, Ra, Na]),
            ("ruggine", vec![Ru, LittleTsu, Ji, Ne]),
            ("accidenti", vec![A, LittleTsu, Chi, De, N, TeI]),
        ]);
    }

    #[test]
    fn accented_letters() {
        assert_romanizations(&[
            ("così", vec![Ko, Shi]),
            ("perché", vec![Pe, Ru, Ke]),
            ("là", vec![Ra]),
            ("capirà", vec![Ka, Pi, Ra]),
        ]);
    }

    /// Runs `romanize` over every line of the bundled dictionary. Nothing is
    /// asserted about the output; the test only fails if a word panics.
    #[test]
    fn test_dictionary() {
        // Read dictionary file
        let test_path = Path::new(file!())
            .parent()
            .expect("the path of this source file has a parent directory")
            .join("./test_assets/parolone.txt");
        let dictionary = std::fs::read_to_string(&test_path)
            .unwrap_or_else(|err| panic!("cannot read {}: {}", test_path.display(), err));

        for word in dictionary.lines() {
            romanize(word.trim());
        }

        // Disabled: dumps the hiragana rendering of every dictionary word to
        // `test_assets/romanized.txt`. Needs the `to_hiragana` import above.
        /*
        let mut str = "".to_string();
        for word in words {
            let romanized = romanize(word.trim());
            let formatted = format!("{} -> {}\n", word, romanized.iter().map(to_hiragana).collect::<Vec<String>>().join(""));
            str.push_str(&formatted);
        }

        // Write to file
        let output_path = Path::new(file!())
            .parent()
            .unwrap()
            .join("./test_assets/romanized.txt");
        std::fs::write(output_path, str).unwrap();
        */
    }
}
|
241
src/syllable.rs
Normal file
241
src/syllable.rs
Normal file
|
@ -0,0 +1,241 @@
|
|||
use crate::syllable::Syllable::*;
|
||||
use std::fmt::Display;
|
||||
|
||||
/// One kana-sized unit of a romanized word.
///
/// Each variant has a hiragana spelling, produced by `to_hiragana` and by the
/// `Display` implementation. The type is a plain fieldless enum, so it is
/// `Copy` and can be used as a hash-map key.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum Syllable {
    // Plain vowels
    A,
    I,
    U,
    E,
    O,
    // K row
    Ka,
    Ki,
    Ku,
    Ke,
    Ko,
    // S row
    Sa,
    Shi,
    Su,
    Se,
    So,
    // T row
    Ta,
    Chi,
    Tsu,
    Te,
    To,
    // N row
    Na,
    Ni,
    Nu,
    Ne,
    No,
    // H row
    Ha,
    Hi,
    Fu,
    He,
    Ho,
    // M row
    Ma,
    Mi,
    Mu,
    Me,
    Mo,
    // Y row
    Ya,
    Yu,
    Yo,
    // R row
    Ra,
    Ri,
    Ru,
    Re,
    Ro,
    Wa,
    /// The standalone nasal `ん`.
    N,
    // G row
    Ga,
    Gi,
    Gu,
    Ge,
    Go,
    // Z row
    Za,
    Ji,
    Zu,
    Ze,
    Zo,
    // D row
    Da,
    De,
    Do,
    // B row
    Ba,
    Bi,
    Bu,
    Be,
    Bo,
    // P row
    Pa,
    Pi,
    Pu,
    Pe,
    Po,
    // Standard Digraphs
    Kya,
    Kyu,
    Kyo,
    Sha,
    Shu,
    Sho,
    Cha,
    Chu,
    Cho,
    Ja,
    Ju,
    Jo,
    Gya,
    Gyu,
    Gyo,
    Bya,
    Byu,
    Byo,
    Pya,
    Pyu,
    Pyo,
    // Not implemented yet: Nya, Nyu, Nyo,
    // Not implemented yet: Mya, Myu, Myo,
    Rya,
    Ryu,
    Ryo,
    // Modifiers
    /// Small `っ`, emitted for a doubled letter.
    LittleTsu,
    /// The long-vowel mark `ー`.
    LongVowel,
    // Mixed: a base kana followed by a small vowel
    ChiE,
    ToU,
    TeI,
    Je,
    Zi,
    // Vu mixes: ゔ followed by a small vowel
    VuA,
    VuI,
    Vu,
    VuE,
    VuO,
    // Full degenerate italian
    Zyo,
    Zya,
    Zyu,
    Zye,
}
|
||||
|
||||
|
||||
pub(crate) fn to_hiragana(syllable: &Syllable) -> String {
|
||||
match syllable {
|
||||
A => "あ",
|
||||
I => "い",
|
||||
U => "う",
|
||||
E => "え",
|
||||
O => "お",
|
||||
Ka => "か",
|
||||
Ki => "き",
|
||||
Ku => "く",
|
||||
Ke => "け",
|
||||
Ko => "こ",
|
||||
Sa => "さ",
|
||||
Shi => "し",
|
||||
Su => "す",
|
||||
Se => "せ",
|
||||
So => "そ",
|
||||
Ta => "た",
|
||||
Chi => "ち",
|
||||
Tsu => "つ",
|
||||
Te => "て",
|
||||
To => "と",
|
||||
Na => "な",
|
||||
Ni => "に",
|
||||
Nu => "ぬ",
|
||||
Ne => "ね",
|
||||
No => "の",
|
||||
Ha => "は",
|
||||
Hi => "ひ",
|
||||
Fu => "ふ",
|
||||
He => "へ",
|
||||
Ho => "ほ",
|
||||
Ma => "ま",
|
||||
Mi => "み",
|
||||
Mu => "む",
|
||||
Me => "め",
|
||||
Mo => "も",
|
||||
Ya => "や",
|
||||
Yu => "ゆ",
|
||||
Yo => "よ",
|
||||
Ra => "ら",
|
||||
Ri => "り",
|
||||
Ru => "る",
|
||||
Re => "れ",
|
||||
Ro => "ろ",
|
||||
Wa => "わ",
|
||||
N => "ん",
|
||||
Ga => "が",
|
||||
Gi => "ぎ",
|
||||
Gu => "ぐ",
|
||||
Ge => "げ",
|
||||
Go => "ご",
|
||||
Za => "ざ",
|
||||
Ji => "じ",
|
||||
Zu => "ず",
|
||||
Ze => "ぜ",
|
||||
Zo => "ぞ",
|
||||
Da => "だ",
|
||||
De => "で",
|
||||
Do => "ど",
|
||||
Ba => "ば",
|
||||
Bi => "び",
|
||||
Bu => "ぶ",
|
||||
Be => "べ",
|
||||
Bo => "ぼ",
|
||||
Pa => "ぱ",
|
||||
Pi => "ぴ",
|
||||
Pu => "ぷ",
|
||||
Pe => "ぺ",
|
||||
Po => "ぽ",
|
||||
Kya => "きゃ",
|
||||
Kyu => "きゅ",
|
||||
Kyo => "きょ",
|
||||
Sha => "しゃ",
|
||||
Shu => "しゅ",
|
||||
Sho => "しょ",
|
||||
Cha => "ちゃ",
|
||||
Chu => "ちゅ",
|
||||
Cho => "ちょ",
|
||||
Rya => "りゃ",
|
||||
Ryu => "りゅ",
|
||||
Ryo => "りょ",
|
||||
LittleTsu => "っ",
|
||||
LongVowel => "ー",
|
||||
ChiE => "ちぇ",
|
||||
ToU => "とぅ",
|
||||
TeI => "とぃ",
|
||||
VuA => "ゔぁ",
|
||||
VuI => "ゔぃ",
|
||||
Vu => "ゔ",
|
||||
VuE => "ゔぇ",
|
||||
VuO => "ゔぉ",
|
||||
Ja => "じゃ",
|
||||
Ju => "じゅ",
|
||||
Jo => "じょ",
|
||||
Je => "じぇ",
|
||||
Zi => "ずぃ",
|
||||
Gya => "ぎゃ",
|
||||
Gyu => "ぎゅ",
|
||||
Gyo => "ぎょ",
|
||||
Bya => "びゃ",
|
||||
Byu => "びゅ",
|
||||
Byo => "びょ",
|
||||
Pya => "ぴゃ",
|
||||
Pyu => "ぴゅ",
|
||||
Pyo => "ぴょ",
|
||||
Zyo => "ずょ",
|
||||
Zya => "ずゃ",
|
||||
Zyu => "ずゅ",
|
||||
Zye => "ずぇ",
|
||||
}.to_string()
|
||||
}
|
||||
|
||||
impl Display for Syllable {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", to_hiragana(self))
|
||||
}
|
||||
}
|
95193
src/test_assets/parolone.txt
Normal file
95193
src/test_assets/parolone.txt
Normal file
File diff suppressed because it is too large
Load diff
95193
src/test_assets/romanized.txt
Normal file
95193
src/test_assets/romanized.txt
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue