fix incorrect pruning of composite words
This commit is contained in:
parent
f5e8aa12d9
commit
e5b23bfd16
1 changed files with 21 additions and 2 deletions
|
@ -17,8 +17,8 @@ export class Dict {
|
|||
for (const word of content.split("\n")) {
|
||||
const word_ = word.trim().toLowerCase();
|
||||
if (word_.length !== length) continue;
|
||||
for (const forbidden of [" ", "-", "."]) if (word_.includes(forbidden)) continue;
|
||||
words.add(remove_accent(word_));
|
||||
if (contains_any(word_, [" ", "-", "."])) continue;
|
||||
words.add(remove_accents(word_));
|
||||
}
|
||||
return new Dict(words, length);
|
||||
}
|
||||
|
@ -27,3 +27,22 @@ export class Dict {
|
|||
return `Dict { ${this.words.size} words }`;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Tells whether `text` contains at least one of the given substrings.
 *
 * @param text - The string to search in.
 * @param words - Candidate substrings to look for inside `text`.
 * @returns `true` if any entry of `words` occurs in `text`; `false` when none
 *   does, which includes the case where `words` is empty.
 */
export function contains_any(text: string, words: string[]) {
  return words.some((needle) => text.includes(needle));
}
|
||||
|
||||
/**
 * Strips diacritical marks from `text`, mapping each accented letter to its
 * unaccented base letter (for example "é" becomes "e" and "ç" becomes "c").
 *
 * The text is first decomposed (NFD) so that every accented letter becomes a
 * base letter followed by combining marks, and the combining marks are then
 * removed. Compared with a fixed lookup table this also covers letters such
 * as "ù", "ü" and "ÿ", uppercase accented letters, and input that is already
 * in decomposed form.
 *
 * @param text - The string to clean up.
 * @returns A copy of `text` without combining diacritical marks. Characters
 *   that carry no diacritic are returned unchanged.
 */
export function remove_accents(text: string): string {
  return (
    text
      .normalize("NFD")
      // U+0300–U+036F is the "Combining Diacritical Marks" Unicode block.
      .replace(/[\u0300-\u036f]/g, "")
      // Recompose whatever is left so that characters whose canonical
      // decomposition does not involve those marks keep their original form.
      .normalize("NFC")
  );
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue