Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 40 additions & 13 deletions redash/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import datetime
import logging
import numbers
import re
import time

import pytz
Expand Down Expand Up @@ -644,6 +645,43 @@ def outdated_queries(cls):

return list(outdated_queries.values())

@classmethod
def _do_multi_byte_search(cls, all_queries, term, limit=None):
    """Filter `all_queries` by substring (`ilike`) matching of `term`.

    Used instead of tsvector full-text search because tsvector does not
    handle CJK (multi-byte) text well.  `term` is split into tokens, each
    optionally prefixed with a field qualifier. Supported forms:

      - word
      - name:word
      - query:word
      - "multiple words"
      - name:"multiple words"
      - word1 word2 word3
      - word1 "multiple word" query:"select foo"

    All tokens must match (AND semantics).  Recognized qualifiers are
    `id` (exact numeric match), `name`, `query`, and `description`;
    an unqualified or unrecognized token matches name OR description.

    :param all_queries: base SQLAlchemy query to filter.
    :param term: raw search string.
    :param limit: optional maximum number of rows.
    :return: filtered query ordered by ``Query.id``.
    """
    # Each match is a 3-tuple (qualifier, quoted_value, bare_value);
    # exactly one of the two value groups is non-empty per token.
    tokens = re.findall(r'(?:([^:\s]+):)?(?:"([^"]+)"|(\S+))', term)

    conditions = []
    for key, quoted_value, bare_value in tokens:
        value = quoted_value or bare_value
        pattern = f"%{value}%"

        if key == "id" and value.isdigit():
            # Exact match on the primary key; `id:abc` (non-numeric)
            # deliberately falls through to the generic branch below.
            conditions.append(cls.id == int(value))
        elif key == "name":
            conditions.append(cls.name.ilike(pattern))
        elif key == "query":
            conditions.append(cls.query_text.ilike(pattern))
        elif key == "description":
            conditions.append(cls.description.ilike(pattern))
        else:
            # Unqualified token (or unknown qualifier): match either field.
            conditions.append(or_(cls.name.ilike(pattern), cls.description.ilike(pattern)))

    return all_queries.filter(and_(*conditions)).order_by(Query.id).limit(limit)

@classmethod
def search(
cls,
Expand All @@ -664,12 +702,7 @@ def search(

if multi_byte_search:
# Since tsvector doesn't work well with CJK languages, use `ilike` too
pattern = "%{}%".format(term)
return (
all_queries.filter(or_(cls.name.ilike(pattern), cls.description.ilike(pattern)))
.order_by(Query.id)
.limit(limit)
)
return cls._do_multi_byte_search(all_queries, term, limit)

# sort the result using the weight as defined in the search vector column
return all_queries.search(term, sort=True).limit(limit)
Expand All @@ -678,13 +711,7 @@ def search(
@classmethod
def search_by_user(cls, term, user, limit=None, multi_byte_search=False):
    """Search the queries visible to `user` for `term`.

    :param term: raw search string.
    :param user: user whose accessible queries are searched.
    :param limit: optional maximum number of rows.
    :param multi_byte_search: when True, use substring (`ilike`) matching
        instead of tsvector full-text search, since tsvector doesn't work
        well with CJK languages.
    :return: SQLAlchemy query of matching rows.
    """
    if multi_byte_search:
        # Since tsvector doesn't work well with CJK languages, use `ilike` too
        return cls._do_multi_byte_search(cls.by_user(user), term, limit)

    # Default path: full-text search ranked by the search-vector weights.
    return cls.by_user(user).search(term, sort=True).limit(limit)

Expand Down
Loading