From 95fdd3b40364e2138006dd288ac269ff0f13ed5b Mon Sep 17 00:00:00 2001 From: Lukas Kalbertodt Date: Wed, 18 Dec 2024 15:49:03 +0100 Subject: [PATCH] Add stop words to Meili search index Filtering out very common words that carry basically no information improves indexing performance, shrinks the index and most importantly: helps with the problem that searching for common words results in tons of matches in subtitles and such. This doesn't completely solve the latter problem though. And using stop words also makes things worse unfortunately: especially in phrase search, the highlighting is broken and might confuse users. Phrase search still kind of works but from reading the docs, I think with stop words "the" and "a", searching for "foo the bar" will also find documents with the text "foo a bar". So it's not really clear yet whether we want that at all. Maybe Meili needs to improve first. Or we never send the stop words to Meili and only use them to filter some stuff manually? --- backend/src/search/event.rs | 2 +- backend/src/search/mod.rs | 2 +- backend/src/search/playlist.rs | 2 +- backend/src/search/realm.rs | 2 +- backend/src/search/series.rs | 2 +- backend/src/search/stop-words.txt | 396 ++++++++++++++++++++++++++++++ backend/src/search/user.rs | 2 +- backend/src/search/util.rs | 16 +- util/scripts/check-system.sh | 2 +- 9 files changed, 418 insertions(+), 8 deletions(-) create mode 100644 backend/src/search/stop-words.txt diff --git a/backend/src/search/event.rs b/backend/src/search/event.rs index 8fa8e6400..c505dbad9 100644 --- a/backend/src/search/event.rs +++ b/backend/src/search/event.rs @@ -139,7 +139,7 @@ impl Event { } pub(super) async fn prepare_index(index: &Index) -> Result<()> { - util::lazy_set_special_attributes(index, "event", FieldAbilities { + util::lazy_set_special_attributes(index, "event", true, FieldAbilities { searchable: &[ "title", "creators", diff --git a/backend/src/search/mod.rs b/backend/src/search/mod.rs index 
c42f33d43..7525eab32 100644 --- a/backend/src/search/mod.rs +++ b/backend/src/search/mod.rs @@ -336,7 +336,7 @@ pub(crate) async fn rebuild_if_necessary( for task in tasks { util::wait_on_task(task, meili).await?; } - info!("Completely rebuild search index"); + info!("Completely rebuilt search index"); meili.meta_index.add_or_replace(&[meta::Meta::current_clean()], None).await .context("failed to update index version document (clean)")?; diff --git a/backend/src/search/playlist.rs b/backend/src/search/playlist.rs index 62b276f79..0a9677a19 100644 --- a/backend/src/search/playlist.rs +++ b/backend/src/search/playlist.rs @@ -90,7 +90,7 @@ impl Playlist { } pub(super) async fn prepare_index(index: &Index) -> Result<()> { - util::lazy_set_special_attributes(index, "playlist", FieldAbilities { + util::lazy_set_special_attributes(index, "playlist", true, FieldAbilities { searchable: &["title", "description"], filterable: &["read_roles", "write_roles"], sortable: &["updated_timestamp"], diff --git a/backend/src/search/realm.rs b/backend/src/search/realm.rs index d238cfd22..85d6d9468 100644 --- a/backend/src/search/realm.rs +++ b/backend/src/search/realm.rs @@ -71,7 +71,7 @@ impl Realm { } pub(super) async fn prepare_index(index: &Index) -> Result<()> { - util::lazy_set_special_attributes(index, "realm", FieldAbilities { + util::lazy_set_special_attributes(index, "realm", false, FieldAbilities { searchable: &["name"], filterable: &["is_root", "is_user_realm"], sortable: &[], diff --git a/backend/src/search/series.rs b/backend/src/search/series.rs index f2acdce0c..c7c3e2bfd 100644 --- a/backend/src/search/series.rs +++ b/backend/src/search/series.rs @@ -94,7 +94,7 @@ impl Series { } pub(super) async fn prepare_index(index: &Index) -> Result<()> { - util::lazy_set_special_attributes(index, "series", FieldAbilities { + util::lazy_set_special_attributes(index, "series", true, FieldAbilities { searchable: &["title", "description"], filterable: &["listed", "read_roles", 
"write_roles"], sortable: &["updated_timestamp"], diff --git a/backend/src/search/stop-words.txt b/backend/src/search/stop-words.txt new file mode 100644 index 000000000..d45d410eb --- /dev/null +++ b/backend/src/search/stop-words.txt @@ -0,0 +1,396 @@ +# Single latin letters +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + + +# English +# 'a' and 'i' are already covered by single letters above. +# Based on NLTK's list of english stopwords +about +above +#after -> German word +again +against +all +am +an +and +any +are +as +at +be +because +been +before +being +below +between +both +but +by +can +could +did +do +does +doing +dont +down +during +each +few +for +from +further +had +has +have +having +he +her +here +hers +herself +him +himself +his +how +however +if +in +into +is +it +its +itself +just +like +many +me +more +#most -> German word +must +my +myself +no +nor +not # -> German word, not super common as stand-alone word and very much a English stop word, so we keep it +now +of +off +on +once +only +or +other +our +ours +ourselves +out +over +own +said +same +she +should +so +some +such # -> German word, but probably fine to keep it a stop word +than +that +the +their +theirs +them +themselves +then +there +#these -> German word +they +this +those +through +to +too +under +until +up +using +very +was +we +were +what +when +where +which +while +who +whom +why +will +with +would +you +your +yours +yourself +yourselves + + +# German +aber +alle +allem +allen +aller +alles +als +also #-> English word but also kind of stop-wordy, so keeping it +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +#bin -> english word +bis +bist +da +damit +dann +der +den +des +dem +#die -> english word +das +dass +daß +dazu +dein +deine +deinem +deinen +deiner +deines +denn +derer +dessen +dich +dir +du +#dies -> english word +diese +diesem +diesen +dieser +dieses +doch +dort +durch +ein 
+eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +ihn +ihm +es +etwas +euer +eure +eurem +euren +eurer +eures +für +gab +gegen +gewesen +hab +habe +haben +#hat -> English word +hatte +hatten +hier +hin +hinter +ich +mich +mir +ihr +ihre +ihrem +ihren +ihrer +ihres +euch +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kam +kann +kein +keine +keinem +keinen +keiner +keines +konnte +können +könnte +machen +#man -> English word +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mit +muss +musste +nach +nicht +nichts +noch +#nun -> English word +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +ihnen +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +sowie +über +um +und +uns +unse +unsem +unsen +unser +unses +unter +viel +vom +von +vor +während +#war -> English word +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +wurde +wurden +würde +würden +zu +zum +zur +zwar +zwischen diff --git a/backend/src/search/user.rs b/backend/src/search/user.rs index a34f87ff7..3786a6079 100644 --- a/backend/src/search/user.rs +++ b/backend/src/search/user.rs @@ -64,7 +64,7 @@ impl User { } pub(super) async fn prepare_index(index: &Index) -> Result<()> { - util::lazy_set_special_attributes(index, "user", FieldAbilities { + util::lazy_set_special_attributes(index, "user", false, FieldAbilities { searchable: &["display_name"], filterable: &[], sortable: &[], diff --git a/backend/src/search/util.rs b/backend/src/search/util.rs index 4292ce87a..a3df95cd4 100644 --- a/backend/src/search/util.rs +++ b/backend/src/search/util.rs @@ -1,4 +1,4 @@ -use std::time::Duration; +use std::{sync::LazyLock, time::Duration}; use meilisearch_sdk::{errors::{Error, ErrorCode}, 
indexes::Index, tasks::Task, task_info::TaskInfo}; @@ -22,6 +22,7 @@ pub(super) struct FieldAbilities<'a> { pub(super) async fn lazy_set_special_attributes( index: &Index, index_name: &str, + stop_words: bool, fields: FieldAbilities<'_>, ) -> Result<()> { if index.get_searchable_attributes().await? != fields.searchable { @@ -39,9 +40,22 @@ pub(super) async fn lazy_set_special_attributes( index.set_sortable_attributes(fields.sortable).await?; } + if stop_words && index.get_stop_words().await?.iter().ne(&*STOP_WORDS) { + debug!("Updating stop words of {index_name} index"); + index.set_stop_words(&*STOP_WORDS).await?; + } + Ok(()) } +static STOP_WORDS: LazyLock<Vec<&'static str>> = LazyLock::new(|| { + const RAW: &str = include_str!("stop-words.txt"); + RAW.lines() + .map(|l| l.split('#').next().unwrap().trim()) + .filter(|s| !s.is_empty()) + .collect() +}); + /// Encodes roles inside an ACL (e.g. for an event) to be stored in the index. /// The roles are hex encoded to be filterable properly with Meili's /// case-insensitive filtering. Also, `ROLE_ADMIN` is removed as an space diff --git a/util/scripts/check-system.sh b/util/scripts/check-system.sh index e887af598..3c9878df5 100755 --- a/util/scripts/check-system.sh +++ b/util/scripts/check-system.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -MIN_RUST_VERSION="1.74.0" +MIN_RUST_VERSION="1.80.0" MIN_NPM_VERSION="7.0" has_command() {