Skip to content

Commit 5fbfd28

Browse files
authored
Merge pull request #143 from MITLibraries/tco-113-primo-preprocessor
Adds preprocessor for incoming primo searches
2 parents fa5bb3b + ff94e13 commit 5fbfd28

File tree

4 files changed

+221
-1
lines changed

4 files changed

+221
-1
lines changed

app/models/preprocessor_primo.rb

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# frozen_string_literal: true
2+
3+
# PreprocessorPrimo handles manipulating incoming data from the Primo UI into a structure that TACOS can work with
4+
class PreprocessorPrimo
5+
# to_tacos processes raw incoming query from Primo, looks at each part to see if it is a keyword anywhere search
6+
# Any portion that is not a keyword anywhere search drops the entire search from TACOS, logging
7+
# as the shared Term `unhandled complex primo query` to allow us to track how frequently we are
8+
# dropping terms so we can come back later to build out more complex handling if this is common enough
9+
# to warrant the additional work.
10+
# @param query [String] example `any,contains,this is a keyword search`
# @return [String] the result of `extract_keyword` for a single-part query, or the tracking phrase
#   `unhandled complex primo query` when the query splits into more than one `;;;`-joined part
11+
def self.to_tacos(query)
12+
# Primo and TACOS agreed upon joiner is `;;;`
13+
split_query = query.split(';;;')
14+
15+
if split_query.count > 1
16+
Rails.logger.debug('Multipart primo query detected')
17+
18+
# As we are not currently handling complex queries, always set the value to something we can track frequency of
19+
'unhandled complex primo query'
20+
else
21+
Rails.logger.debug('Simple primo query detected')
22+
23+
# NOTE(review): the unsplit `query` is passed on here, so a query ending in `;;;` (String#split drops the trailing
# empty part, leaving one element) would keep the joiner inside the extracted keyword - confirm this is intended
extract_keyword(query)
24+
end
25+
end
26+
27+
# keyword? confirms whether a portion of a primo query is a keyword search
28+
# Note: we expect only 3 elements in this array for simple keyword searches, and arrays created from the Primo input
29+
# to already be collapsed so commas in the original search have been handled via join_keyword_and_drop_extra_parts
30+
# @param query_part_array [Array] example ['any', 'contains', 'this is a keyword search']
31+
# @return [Boolean]
32+
def self.keyword?(query_part_array)
33+
return false unless query_part_array.count == 3
34+
return false unless query_part_array[0] == 'any'
35+
36+
# For now, we are allowing all variants of the second portion of the primo query input
37+
# The expected values are: contains, exact, begins_with, equals
38+
# Uncommenting the following statement would allow us to restrict to just the default 'contains' if desirable
39+
#
40+
# return false unless query_part_array[1] == 'contains'
41+
42+
true
43+
end
44+
45+
# extract_keyword works at the level of a single keyword query input coming from primo and
46+
# returns a string with just that keyword with the operators removed
47+
# @param query_part [String] example `any,contains,this is a keyword search`
48+
# @return [String] the extracted keyword phrase, or one of the tracking phrases `invalid primo query`
#   (fewer than 3 comma-separated parts) / `unhandled complex primo query` (not a keyword anywhere search)
49+
def self.extract_keyword(query_part)
50+
# NOTE(review): String#split without a limit drops trailing empty fields, so trailing commas in the keyword are
# not preserved and an input such as `any,contains,` is reported as invalid - confirm this is acceptable
query_part_array = query_part.split(',')
51+
52+
# We don't anticipate this being a normal state so we are tracking it under the Term `invalid primo query` as well
53+
# as sending a message to Sentry so we can understand the context in which this happens if it does
54+
if query_part_array.count < 3
55+
Sentry.capture_message('PreprocessorPrimo: Invalid Primo query during keyword extraction')
56+
return 'invalid primo query'
57+
end
58+
59+
the_keywords = join_keyword_and_drop_extra_parts(query_part_array)
60+
61+
# keyword? requires exactly 3 elements, so the rejoined keywords are passed as the single third element
return 'unhandled complex primo query' unless keyword?([query_part_array[0], query_part_array[1], the_keywords])
62+
63+
the_keywords
64+
end
65+
66+
# join_keyword_and_drop_extra_parts handles the logic necessary to join searches that contain commas into a single ruby string
67+
# after we separate the incoming string into an array based on commas
68+
# @param query_part_array [Array] example `['any', 'contains', 'this', 'is', 'a', 'keyword', 'search']`
69+
# @return [String] example 'this,is,a,keyword,search'
70+
def self.join_keyword_and_drop_extra_parts(query_part_array)
71+
# For complex queries, which we are not handling yet, we'll need to determine how TACOS should handle the final
72+
# element of the input which will be a boolean operator. For now, we will have stopped processing those by this
73+
# point during the initial logic in `to_tacos` that splits on `;;;` and returns if the result is more than one query
74+
# Everything from index 2 onward is treated as keyword text and rejoined with commas
query_part_array.slice(2..).join(',')
75+
end
76+
end

app/models/search_logger.rb

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,24 @@ class SearchLogger
66
# Receives a phrase and source and creates a search event. Will find or create a term as needed.
77
# @return [SearchEvent] the newly created SearchEvent
88
def self.logevent(phrase, source)
9-
term = Term.create_or_find_by!(phrase:)
9+
term = Term.create_or_find_by!(phrase: extract_phrase(phrase, source))
1010
term.calculate_categorizations
1111
term.search_events.create!(source:)
1212
end
13+
14+
# Coordinates `phrase` extraction from incoming data from each `source`. If no `source` is matched,
15+
# passes through incoming `phrase`.
16+
# Note: as it may become useful to test in a production environment, we match on patterns of sources
17+
# rather than exact string matches. Example: `primo`, `primo-testing`, `primo-playground` are all handled
18+
# with the same case.
# The `/primo/` pattern is unanchored, so any source containing `primo` anywhere takes the Primo branch.
# @param phrase the incoming search input (presumably a String; the Primo branch hands it to
#   PreprocessorPrimo.to_tacos) - TODO confirm against callers
# @param source the identifier of the originating system, matched against `/primo/`
# @return the result of PreprocessorPrimo.to_tacos for Primo sources, otherwise the incoming `phrase` unchanged
19+
def self.extract_phrase(phrase, source)
20+
case source
21+
when /primo/
22+
Rails.logger.debug('Primo case detected')
23+
PreprocessorPrimo.to_tacos(phrase)
24+
else
25+
Rails.logger.debug('default case detected')
26+
phrase
27+
end
28+
end
1329
end

test/controllers/graphql_controller_test.rb

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,4 +214,30 @@ class GraphqlControllerTest < ActionDispatch::IntegrationTest
214214
assert_equal 'Transactional', json['data']['lookupTerm']['categories'].first['name']
215215
assert_in_delta 0.95, json['data']['lookupTerm']['categories'].first['confidence']
216216
end
217+
218+
test 'primo searches use the preprocessor to extract actual keywords' do
219+
post '/graphql', params: { query: '{
220+
logSearchEvent(sourceSystem: "primo-test",
221+
searchTerm: "any,contains,Super cool search") {
222+
phrase
223+
}
224+
}' }
225+
226+
json = response.parsed_body
227+
228+
assert_equal 'Super cool search', json['data']['logSearchEvent']['phrase']
229+
end
230+
231+
test 'primo searches use the preprocessor and logs complex queries to a specific term' do
232+
post '/graphql', params: { query: '{
233+
logSearchEvent(sourceSystem: "primo-test",
234+
searchTerm: "any,contains,Super cool search;;;any,contains,uh oh this is getting complicated") {
235+
phrase
236+
}
237+
}' }
238+
239+
json = response.parsed_body
240+
241+
assert_equal 'unhandled complex primo query', json['data']['logSearchEvent']['phrase']
242+
end
217243
end
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# frozen_string_literal: true
2+
3+
#
4+
require 'test_helper'
5+
6+
class PreprocessorPrimoTest < ActiveSupport::TestCase
7+
test 'to_tacos returns unhandled for complex queries' do
8+
input = 'any,contains,space;;;any,contains,madness'
9+
10+
assert_equal('unhandled complex primo query', PreprocessorPrimo.to_tacos(input))
11+
end
12+
13+
test 'to_tacos returns unhandled for targeted field queries' do
14+
input = 'title,contains,space'
15+
16+
assert_equal('unhandled complex primo query', PreprocessorPrimo.to_tacos(input))
17+
end
18+
19+
test 'to_tacos returns phrase ready for tacos for simple keyword input' do
20+
input = 'any,contains,space'
21+
22+
assert_equal('space', PreprocessorPrimo.to_tacos(input))
23+
end
24+
25+
test 'to_tacos returns phrase ready for complex keyword input' do
26+
input = 'any,contains,Yan, F., Krantz, P., Sung, Y., Kjaergaard, M., Campbell, D.L., Orlando, T.P., Gustavsson, S. and Oliver, W.D., 2018. Tunable coupling scheme for implementing high-fidelity two-qubit gates. Physical Review Applied, 10(5), p.054062.'
27+
expected = 'Yan, F., Krantz, P., Sung, Y., Kjaergaard, M., Campbell, D.L., Orlando, T.P., Gustavsson, S. and Oliver, W.D., 2018. Tunable coupling scheme for implementing high-fidelity two-qubit gates. Physical Review Applied, 10(5), p.054062.'
28+
29+
assert_equal(expected, PreprocessorPrimo.to_tacos(input))
30+
end
31+
32+
test 'keyword? returns true for any contains phrase pattern' do
33+
input = 'any,contains,popcorn anomoly'.split(',')
34+
35+
assert(PreprocessorPrimo.keyword?(input))
36+
end
37+
38+
test 'keyword? returns false for input with more than 3 array elements' do
39+
# NOTE: this query entering tacos would work... but it would have been cleaned up prior to running
40+
# keyword? in our application via the normal flow
41+
input = 'any,contains,popcorn anomoly: why life on the moon is complex, and other cat facts'.split(',')
42+
43+
assert_not(PreprocessorPrimo.keyword?(input))
44+
end
45+
46+
test 'keyword? returns false for input with less than 3 array elements' do
47+
input = 'any,contains'.split(',')
48+
49+
assert_not(PreprocessorPrimo.keyword?(input))
50+
end
51+
52+
test 'keyword? returns false for non-any input' do
53+
input = 'title,contains,popcorn anomoly'.split(',')
54+
55+
assert_not(PreprocessorPrimo.keyword?(input))
56+
end
57+
58+
test 'keyword? returns true for non-contains inputs' do
59+
# NOTE: this portion of the primo query focuses on how to handle the phrase. All the words, any of the words,
60+
# the exact phrase, begins_with. For now we treat them all the same as standard keyword queries.
61+
input = 'any,exact,popcorn anomoly'.split(',')
62+
63+
assert(PreprocessorPrimo.keyword?(input))
64+
end
65+
66+
test 'extract keyword returns keyword for simple keywords' do
67+
input = 'any,contains,popcorn anomoly'
68+
69+
assert_equal('popcorn anomoly', PreprocessorPrimo.extract_keyword(input))
70+
end
71+
72+
test 'extract keyword returns keyword for simple non-contains keywords' do
73+
input = 'any,exact,popcorn anomoly'
74+
75+
assert_equal('popcorn anomoly', PreprocessorPrimo.extract_keyword(input))
76+
end
77+
78+
test 'extract keyword returns unhandled complex primo query for non-any searches' do
79+
input = 'title,contains,popcorn anomoly'
80+
81+
assert_equal('unhandled complex primo query', PreprocessorPrimo.extract_keyword(input))
82+
end
83+
84+
test 'extract keyword returns keyword for keywords with punctuation' do
85+
input = 'any,contains,popcorn anomoly: a cats! life. on & mars!'
86+
87+
assert_equal('popcorn anomoly: a cats! life. on & mars!', PreprocessorPrimo.extract_keyword(input))
88+
end
89+
90+
test 'extract keyword returns keyword for keywords with commas' do
91+
input = 'any,contains,popcorn anomoly, and so can you'
92+
93+
assert_equal('popcorn anomoly, and so can you', PreprocessorPrimo.extract_keyword(input))
94+
end
95+
96+
test 'extract keyword returns keyword for keywords with multiple commas and other punctuation' do
97+
input = 'any,contains,popcorn anomoly: a cats! life. on & mars!, words, of {truth} (and, also not,)'
98+
99+
assert_equal('popcorn anomoly: a cats! life. on & mars!, words, of {truth} (and, also not,)',
100+
PreprocessorPrimo.extract_keyword(input))
101+
end
102+
end

0 commit comments

Comments
 (0)