diff --git a/docs/models.py b/docs/models.py index 00ed0628ca..07f2174eab 100644 --- a/docs/models.py +++ b/docs/models.py @@ -15,7 +15,7 @@ TrigramSimilarity, ) from django.core.cache import cache -from django.db import models, transaction +from django.db import connection, models, transaction from django.db.models import Prefetch, Q from django.db.models.fields.json import KeyTextTransform from django.utils.functional import cached_property @@ -174,6 +174,18 @@ def sync_to_db(self, decoded_documents): if line.startswith(f"Disallow: /{self.lang}/{self.release_id}/") ] + language_mapping = TSEARCH_CONFIG_LANGUAGES + english = "custom_english" + with connection.cursor() as cursor: + cursor.execute( + "SELECT EXISTS(SELECT 1 FROM pg_ts_config WHERE cfgname = %s)", + [english], + ) + has_custom_english_config = cursor.fetchone()[0] + + if has_custom_english_config: + language_mapping["en"] = english + for document in decoded_documents: if ( "body" not in document @@ -192,9 +204,7 @@ def sync_to_db(self, decoded_documents): path=document_path, title=html.unescape(strip_tags(document["title"])), metadata=document, - config=TSEARCH_CONFIG_LANGUAGES.get( - self.lang[:2], DEFAULT_TEXT_SEARCH_CONFIG - ), + config=language_mapping.get(self.lang[:2], DEFAULT_TEXT_SEARCH_CONFIG), ) for document in self.documents.all(): document.metadata["breadcrumbs"] = list( diff --git a/docs/stopwords/README.md b/docs/stopwords/README.md new file mode 100644 index 0000000000..ab24e09a63 --- /dev/null +++ b/docs/stopwords/README.md @@ -0,0 +1,38 @@ +# Instructions to create a new search dictionary + +In this folder, there is `custom_english.stop`. + +This copies the [snowball english stop words](https://github.com/postgres/postgres/blob/master/src/backend/snowball/stopwords/english.stop) +but removes some stop words such as "through" and "when". This is because these +terms are also used in Django code. + +The file format is a list of words, one per line. 
Blank lines and trailing +spaces are ignored, and upper case is folded to lower case, but no other +processing is done on the file contents. + +This file needs to be created in `$SHAREDIR/tsearch_data/custom_english.stop`, +where `$SHAREDIR` means the PostgreSQL installation's shared-data directory, +available via `pg_config --sharedir`. + +See https://www.postgresql.org/docs/current/textsearch-dictionaries.html + +Once the custom stop words file has been created, we can run the following SQL: + +```sql +CREATE TEXT SEARCH DICTIONARY custom_english ( + TEMPLATE = snowball, + Language = english, + StopWords = custom_english +); + +CREATE TEXT SEARCH CONFIGURATION public.custom_english ( + COPY = pg_catalog.english +); + +ALTER TEXT SEARCH CONFIGURATION public.custom_english + ALTER MAPPING + FOR asciiword, asciihword, hword_asciipart, hword, hword_part, word + WITH custom_english; +``` + +This should then mean the `custom_english` text search dictionary and configuration are available. diff --git a/docs/stopwords/custom_english.stop b/docs/stopwords/custom_english.stop new file mode 100644 index 0000000000..3a7522ef87 --- /dev/null +++ b/docs/stopwords/custom_english.stop @@ -0,0 +1,119 @@ +i +me +my +myself +we +our +ours +ourselves +you +your +yours +yourself +yourselves +he +him +his +himself +she +her +hers +herself +it +its +itself +they +them +their +theirs +themselves +what +which +who +whom +this +that +these +those +am +is +are +was +were +be +been +being +have +has +had +having +do +does +did +doing +a +an +the +and +but +or +because +as +until +while +of +at +by +about +against +between +into +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under +again +further +then +once +here +there +where +why +how +any +both +each +few +more +most +other +some +such +no +nor +not +own +same +so +than +too +very +s +t +can +will +just +don +should