Skip to content

Commit 60e0372

Browse files
committed
feat: handle multiple derivations for words in the metadata
1 parent 615014c commit 60e0372

File tree

9 files changed

+156
-28
lines changed

9 files changed

+156
-28
lines changed

harper-cli/src/main.rs

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ enum Args {
7676
/// The document to mine words from.
7777
file: PathBuf,
7878
},
79+
/// Get the word associated with a particular word id.
80+
WordFromId { hash: u64 },
7981
}
8082

8183
fn main() -> anyhow::Result<()> {
@@ -227,6 +229,21 @@ fn main() -> anyhow::Result<()> {
227229

228230
println!("{json}");
229231

232+
// iterate through any and all derived_from and resolve the word from each wordid
233+
if let Some(metadata) = dictionary.get_word_metadata_str(&word) {
234+
if let Some(derived_from) = &metadata.derived_from {
235+
let derived_words: Vec<String> = derived_from
236+
.iter()
237+
.filter_map(|wordid| dictionary.get_word_from_id(wordid))
238+
.map(|word| word.iter().collect())
239+
.collect();
240+
241+
if !derived_words.is_empty() {
242+
println!("derived_from: {:?}", derived_words);
243+
}
244+
}
245+
}
246+
230247
Ok(())
231248
}
232249
Args::SummarizeLintRecord { file } => {
@@ -361,6 +378,11 @@ fn main() -> anyhow::Result<()> {
361378

362379
Ok(())
363380
}
381+
Args::WordFromId { hash } => {
382+
let id = WordId::from_hash(hash);
383+
println!("{:?}", dictionary.get_word_from_id(&id));
384+
Ok(())
385+
}
364386
}
365387
}
366388

@@ -402,9 +424,14 @@ fn print_word_derivations(word: &str, annot: &str, dictionary: &impl Dictionary)
402424

403425
let id = WordId::from_word_str(word);
404426

405-
let children = dictionary
406-
.words_iter()
407-
.filter(|e| dictionary.get_word_metadata(e).unwrap().derived_from == Some(id));
427+
let children = dictionary.words_iter().filter(|e| {
428+
dictionary
429+
.get_word_metadata(e)
430+
.unwrap()
431+
.derived_from
432+
.as_ref()
433+
.is_some_and(|derived| derived.contains(&id))
434+
});
408435

409436
println!(" - {}", word);
410437

harper-core/src/fat_token.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use crate::{CharStringExt, TokenKind};
44

55
/// A [`Token`](crate::Token) that holds its content as a fat [`Vec<char>`] rather than as a
66
/// [`Span`](crate::Span).
7-
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd, Hash, Eq)]
7+
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Hash, Eq)]
88
pub struct FatToken {
99
pub content: Vec<char>,
1010
pub kind: TokenKind,
@@ -20,7 +20,7 @@ impl From<FatStringToken> for FatToken {
2020
}
2121

2222
/// Similar to a [`FatToken`], but uses a [`String`] as the underlying store.
23-
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd, Hash, Eq)]
23+
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
2424
pub struct FatStringToken {
2525
pub content: String,
2626
pub kind: TokenKind,

harper-core/src/ignored_lints/lint_context.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use serde::{Deserialize, Serialize};
2+
use std::hash::{Hash, Hasher};
23

34
use crate::{
45
Document, FatToken,
@@ -7,7 +8,7 @@ use crate::{
78

89
/// A location-agnostic structure that attempts to capture the context and content in which a [`Lint`]
910
/// occurred.
10-
#[derive(Debug, Hash, Serialize, Deserialize)]
11+
#[derive(Debug, Serialize, Deserialize)]
1112
pub struct LintContext {
1213
pub lint_kind: LintKind,
1314
pub suggestions: Vec<Suggestion>,
@@ -16,6 +17,16 @@ pub struct LintContext {
1617
pub tokens: Vec<FatToken>,
1718
}
1819

20+
impl Hash for LintContext {
21+
fn hash<H: Hasher>(&self, state: &mut H) {
22+
self.lint_kind.hash(state);
23+
self.suggestions.hash(state);
24+
self.message.hash(state);
25+
self.priority.hash(state);
26+
self.tokens.hash(state);
27+
}
28+
}
29+
1930
impl LintContext {
2031
pub fn from_lint(lint: &Lint, document: &Document) -> Self {
2132
let Lint {

harper-core/src/spell/fst_dictionary.rs

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -307,52 +307,55 @@ mod tests {
307307
#[test]
308308
fn plural_llamas_derived_from_llama() {
309309
let dict = FstDictionary::curated();
310-
311-
assert_eq!(
310+
assert!(
312311
dict.get_word_metadata_str("llamas")
313312
.unwrap()
314313
.derived_from
315-
.unwrap(),
316-
WordId::from_word_str("llama")
317-
)
314+
.as_ref()
315+
.unwrap()
316+
.contains(&WordId::from_word_str("llama"))
317+
);
318318
}
319319

320320
#[test]
321321
fn plural_cats_derived_from_cat() {
322322
let dict = FstDictionary::curated();
323323

324-
assert_eq!(
324+
assert!(
325325
dict.get_word_metadata_str("cats")
326326
.unwrap()
327327
.derived_from
328-
.unwrap(),
329-
WordId::from_word_str("cat")
328+
.as_ref()
329+
.unwrap()
330+
.contains(&WordId::from_word_str("cat"))
330331
);
331332
}
332333

333334
#[test]
334335
fn unhappy_derived_from_happy() {
335336
let dict = FstDictionary::curated();
336337

337-
assert_eq!(
338+
assert!(
338339
dict.get_word_metadata_str("unhappy")
339340
.unwrap()
340341
.derived_from
341-
.unwrap(),
342-
WordId::from_word_str("happy")
342+
.as_ref()
343+
.unwrap()
344+
.contains(&WordId::from_word_str("happy"))
343345
);
344346
}
345347

346348
#[test]
347349
fn quickly_derived_from_quick() {
348350
let dict = FstDictionary::curated();
349351

350-
assert_eq!(
352+
assert!(
351353
dict.get_word_metadata_str("quickly")
352354
.unwrap()
353355
.derived_from
354-
.unwrap(),
355-
WordId::from_word_str("quick")
356+
.as_ref()
357+
.unwrap()
358+
.contains(&WordId::from_word_str("quick"))
356359
);
357360
}
358361
}

harper-core/src/spell/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ mod rune;
1414
mod word_id;
1515
mod word_map;
1616

17-
#[derive(PartialEq, Debug, Hash, Eq)]
17+
#[derive(PartialEq, Debug, Eq)]
1818
pub struct FuzzyMatchResult<'a> {
1919
pub word: &'a [char],
2020
pub edit_distance: u8,

harper-core/src/spell/rune/attribute_list.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use hashbrown::HashMap;
1+
use hashbrown::{HashMap, HashSet};
22
use serde::{Deserialize, Serialize};
33
use smallvec::ToSmallVec;
44

@@ -89,11 +89,17 @@ impl AttributeList {
8989
);
9090
let t_metadata = dest.get_metadata_mut_chars(&new_word).unwrap();
9191
t_metadata.append(&metadata);
92-
t_metadata.derived_from = Some(WordId::from_word_chars(&word.letters))
92+
t_metadata
93+
.derived_from
94+
.get_or_insert_with(HashSet::new)
95+
.insert(WordId::from_word_chars(&word.letters));
9396
}
9497
} else {
9598
for (key, mut value) in new_words.into_iter() {
96-
value.derived_from = Some(WordId::from_word_chars(&word.letters));
99+
value
100+
.derived_from
101+
.get_or_insert_with(HashSet::new)
102+
.insert(WordId::from_word_chars(&word.letters));
97103

98104
if let Some(val) = dest.get_metadata_mut_chars(&key) {
99105
val.append(&value);

harper-core/src/spell/word_id.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,8 @@ impl WordId {
3131
let chars: CharString = text.as_ref().chars().collect();
3232
Self::from_word_chars(chars)
3333
}
34+
35+
pub fn from_hash(hash: u64) -> Self {
36+
Self { hash }
37+
}
3438
}

harper-core/src/token_kind.rs

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@ use is_macro::Is;
22
use serde::{Deserialize, Serialize};
33

44
use crate::{ConjunctionData, NounData, Number, PronounData, Punctuation, Quote, WordMetadata};
5+
use std::hash::{Hash, Hasher};
56

6-
#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
7+
#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, Eq, PartialEq)]
78
#[serde(tag = "kind", content = "value")]
89
pub enum TokenKind {
910
/// `None` if the word does not exist in the dictionary.
@@ -26,6 +27,49 @@ pub enum TokenKind {
2627
Regexish,
2728
}
2829

30+
impl Hash for TokenKind {
31+
fn hash<H: Hasher>(&self, state: &mut H) {
32+
match self {
33+
TokenKind::Word(metadata) => {
34+
metadata.hash(state);
35+
}
36+
TokenKind::Punctuation(punct) => {
37+
punct.hash(state);
38+
}
39+
TokenKind::Decade => {
40+
0.hash(state);
41+
}
42+
TokenKind::Number(number) => {
43+
number.hash(state);
44+
}
45+
TokenKind::Space(space) => {
46+
space.hash(state);
47+
}
48+
TokenKind::Newline(newline) => {
49+
newline.hash(state);
50+
}
51+
TokenKind::EmailAddress => {
52+
0.hash(state);
53+
}
54+
TokenKind::Url => {
55+
0.hash(state);
56+
}
57+
TokenKind::Hostname => {
58+
0.hash(state);
59+
}
60+
TokenKind::Unlintable => {
61+
0.hash(state);
62+
}
63+
TokenKind::ParagraphBreak => {
64+
0.hash(state);
65+
}
66+
TokenKind::Regexish => {
67+
0.hash(state);
68+
}
69+
}
70+
}
71+
}
72+
2973
impl TokenKind {
3074
pub fn is_open_square(&self) -> bool {
3175
matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))

harper-core/src/word_metadata.rs

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1+
use hashbrown::HashSet;
2+
use std::hash::{Hash, Hasher};
3+
14
use is_macro::Is;
25
use paste::paste;
36
use serde::{Deserialize, Serialize};
47
use strum_macros::{Display, EnumString};
58

69
use crate::WordId;
710

8-
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)]
11+
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
912
pub struct WordMetadata {
1013
pub noun: Option<NounData>,
1114
pub pronoun: Option<PronounData>,
@@ -28,7 +31,28 @@ pub struct WordMetadata {
2831
#[serde(default = "default_false")]
2932
pub common: bool,
3033
#[serde(default = "default_none")]
31-
pub derived_from: Option<WordId>,
34+
pub derived_from: Option<HashSet<WordId>>,
35+
}
36+
37+
impl Hash for WordMetadata {
38+
fn hash<H: Hasher>(&self, state: &mut H) {
39+
self.noun.hash(state);
40+
self.pronoun.hash(state);
41+
self.verb.hash(state);
42+
self.adjective.hash(state);
43+
self.adverb.hash(state);
44+
self.conjunction.hash(state);
45+
self.swear.hash(state);
46+
self.dialect.hash(state);
47+
self.determiner.hash(state);
48+
self.preposition.hash(state);
49+
self.common.hash(state);
50+
if let Some(ref derived_from) = self.derived_from {
51+
for id in derived_from.iter() {
52+
id.hash(state);
53+
}
54+
}
55+
}
3256
}
3357

3458
/// Needed for `serde`
@@ -111,7 +135,16 @@ impl WordMetadata {
111135
determiner: self.determiner || other.determiner,
112136
preposition: self.preposition || other.preposition,
113137
common: self.common || other.common,
114-
derived_from: self.derived_from.or(other.derived_from),
138+
derived_from: match (&self.derived_from, &other.derived_from) {
139+
(Some(a), Some(b)) => {
140+
let mut set = a.clone();
141+
set.extend(b);
142+
Some(set)
143+
}
144+
(Some(a), None) => Some(a.clone()),
145+
(None, Some(b)) => Some(b.clone()),
146+
(None, None) => None,
147+
},
115148
}
116149
}
117150

0 commit comments

Comments
 (0)