diff --git a/docs/field/keyword_text_delimited_field.rst b/docs/field/keyword_text_delimited_field.rst new file mode 100644 index 000000000..e5cc92ca7 --- /dev/null +++ b/docs/field/keyword_text_delimited_field.rst @@ -0,0 +1,10 @@ +.. _keyword_text_delimited_field: + +KeywordTextDelimitedField +========================== + +.. module:: pyatlan.model.fields.atlan_fields + :no-index: + +.. autoclass:: KeywordTextDelimitedField + :inherited-members: diff --git a/docs/fields.rst b/docs/fields.rst index ee1327cb5..2b9d2de66 100644 --- a/docs/fields.rst +++ b/docs/fields.rst @@ -27,5 +27,6 @@ Subclasses: field/numeric_field field/numeric_rank_field field/keyword_text_field + field/keyword_text_delimited_field field/keyword_text_stemmed_field field/custom_metadata_field diff --git a/pyatlan/generator/class_generator.py b/pyatlan/generator/class_generator.py index 19eb73cec..c8c92a225 100644 --- a/pyatlan/generator/class_generator.py +++ b/pyatlan/generator/class_generator.py @@ -577,6 +577,7 @@ class IndexType(Enum): BOOLEAN = enum.auto() NUMERIC = enum.auto() STEMMED = enum.auto() + DELIMITER = enum.auto() RELATION = enum.auto() @@ -634,6 +635,9 @@ def get_indexes_for_attribute() -> Dict[IndexType, str]: searchable[IndexType.STEMMED] = attr_name else: searchable[IndexType.TEXT] = attr_name + elif analyzer == "atlan_text_analyzer_v2": + # Delimiter index uses atlan_text_analyzer_v2 + searchable[IndexType.DELIMITER] = attr_name elif attr_def.get("indexType") == "STRING": searchable[IndexType.KEYWORD] = attr_name else: @@ -647,10 +651,28 @@ def get_indexes_for_attribute() -> Dict[IndexType, str]: if index_type == "keyword": searchable[IndexType.KEYWORD] = field_name elif index_type == "text": - if field_name.endswith(".stemmed"): - searchable[IndexType.STEMMED] = field_name - else: - searchable[IndexType.TEXT] = field_name + # Skip adding TEXT index for description/userDescription subfields + # as they already have a main TEXT index + if attr_name in ( + "description", + 
"userDescription", + ) and field_suffix in ("text", "keyword"): + continue + + if field_config := fields.get(field_suffix): + if field_analyzer := field_config.get("analyzer"): + if field_analyzer == "atlan_text_analyzer_v2": + searchable[IndexType.DELIMITER] = field_name + elif field_analyzer == "atlan_text_analyzer": + if field_name.endswith(".stemmed"): + searchable[IndexType.STEMMED] = field_name + else: + searchable[IndexType.TEXT] = field_name + else: + if field_name.endswith(".stemmed"): + searchable[IndexType.STEMMED] = field_name + else: + searchable[IndexType.TEXT] = field_name elif index_type == "rank_feature": searchable[IndexType.RANK_FEATURE] = field_name else: @@ -684,6 +706,13 @@ def get_indexes_for_attribute() -> Dict[IndexType, str]: name="KeywordTextField", args=f'"{search_map.get(IndexType.KEYWORD)}", "{search_map.get(IndexType.TEXT)}"', ) + elif indices == {IndexType.KEYWORD, IndexType.TEXT, IndexType.DELIMITER}: + return SearchType( + name="KeywordTextDelimitedField", + args=f'"{search_map.get(IndexType.KEYWORD)}", ' + f'"{search_map.get(IndexType.TEXT)}", ' + f'"{search_map.get(IndexType.DELIMITER)}"', + ) elif indices == {IndexType.KEYWORD, IndexType.TEXT, IndexType.STEMMED}: return SearchType( name="KeywordTextStemmedField", diff --git a/pyatlan/generator/templates/imports.jinja2 b/pyatlan/generator/templates/imports.jinja2 index 07b7b0af9..cfeb6609e 100644 --- a/pyatlan/generator/templates/imports.jinja2 +++ b/pyatlan/generator/templates/imports.jinja2 @@ -82,6 +82,7 @@ from pyatlan.model.fields.atlan_fields import ( InternalNumericField, KeywordField, KeywordTextField, + KeywordTextDelimitedField, KeywordTextStemmedField, NumericField, NumericRankField, diff --git a/pyatlan/model/fields/atlan_fields.py b/pyatlan/model/fields/atlan_fields.py index 5b2c66699..0f8dde577 100644 --- a/pyatlan/model/fields/atlan_fields.py +++ b/pyatlan/model/fields/atlan_fields.py @@ -559,6 +559,45 @@ def text_field_name(self) -> str: return 
class KeywordTextDelimitedField(KeywordTextField):
    """
    A searchable Atlan field that supports keyword and text queries and, in addition,
    exposes a "delimited" index variant (analyzed with atlan_text_analyzer_v2).
    """

    # Name of the delimited text field in the search index.
    delimited_field_name: StrictStr

    def __init__(
        self,
        atlan_field_name: StrictStr,
        keyword_field_name: StrictStr,
        text_field_name: StrictStr,
        delimited_field_name: StrictStr,
    ):
        """
        Build the field from its attribute name and its three index field names.

        :param atlan_field_name: name of the attribute in the metastore
        :param keyword_field_name: name of the keyword field in the search index
        :param text_field_name: name of the text field in the search index
        :param delimited_field_name: name of the delimited text field in the search index
        """
        # The parent takes care of the attribute, keyword and text names;
        # only the delimited index name is tracked by this subclass.
        super().__init__(atlan_field_name, keyword_field_name, text_field_name)
        self.delimited_field_name = delimited_field_name

    def match_delimited(self, value: StrictStr) -> Query:
        """
        Build a textual match query against the delimited variant of this field. The
        provided value is analyzed with the atlan_text_analyzer_v2 analyzer, which
        tokenizes on common delimiters.

        :param value: the string value to match against
        :returns: a query that will only match assets whose analyzed value for the field matches the value provided
                  (which will also be analyzed using the delimited analyzer)
        """
        target_field = self.delimited_field_name
        return Match(field=target_field, query=value)