Skip to content

Commit 204744f

Browse files
authored
Merge pull request #619 from CodeForPhilly/588_update-matching-criteria
588: Delete invalid data from pdp_contacts
2 parents 4b9f866 + 18a7994 commit 204744f

File tree

1 file changed

+68
-0
lines changed

1 file changed

+68
-0
lines changed

src/server/pipeline/flow_script.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ def start_flow():
7070
logger.debug("Clearing pdp_contacts to prepare for match")
7171
reset_pdp_contacts_with_unmatched(conn)
7272

73+
logger.debug("Removing invalid entries from pdp_contacts")
74+
filter_invalid_pdp_data(conn)
75+
7376
logger.debug("Computing automatic matches")
7477
automatic_matches = get_automatic_matches(conn)
7578
logger.debug("Computing manual matches")
@@ -129,6 +132,71 @@ def compare_names(n1, n2):
129132
return name_to_array(n1).bool_op("&&")(name_to_array(n2))
130133

131134

135+
def filter_invalid_pdp_data(conn):
    """Delete rows with placeholder or junk names from the PdpContacts table.

    A row is deleted when its lower-cased first/last name matches any of
    the following:

    * both names contain "unknown"
    * both names are exactly "?"
    * first name is "john" or "jane" and last name is "doe"
    * first name is "no" and last name is "name"
    * first name is NULL and last name is "friends"
    * either name is exactly "(red flag)"
    * both names consist solely of digits
    * both names contain "noname" / "no name" (one optional whitespace char)

    Args:
        conn: Connection on which the DELETE statement is executed.

    Returns:
        Whatever ``conn.execute`` returns for the DELETE statement.
    """
    pc = PdpContacts.__table__.alias()
    lower_first_name = func.lower(pc.c.first_name)
    lower_last_name = func.lower(pc.c.last_name)

    unknown = and_(
        lower_first_name.ilike("%unknown%"),
        lower_last_name.ilike("%unknown%"),
    )

    question_mark = and_(
        lower_first_name == "?",
        lower_last_name == "?",
    )

    john_or_jane_doe = and_(
        or_(
            lower_first_name == "john",
            lower_first_name == "jane",
        ),
        lower_last_name == "doe",
    )

    no_name = and_(
        lower_first_name == "no",
        lower_last_name == "name",
    )

    none_friends = and_(
        lower_first_name.is_(None),
        lower_last_name == "friends",
    )

    # Unlike the other rules, a single matching name is enough here.
    red_flag = or_(
        lower_first_name == "(red flag)",
        lower_last_name == "(red flag)",
    )

    # Regex rules use the "~" match operator that the previous raw-SQL
    # fragments used.  bool_op() marks the expression as boolean so it can be
    # combined with and_/or_, and the raw-string patterns avoid Python's
    # invalid-escape warnings for "\d" and "\s".
    digits_only_pattern = r"^\d+$"
    digits_only = and_(
        lower_first_name.bool_op("~")(digits_only_pattern),
        lower_last_name.bool_op("~")(digits_only_pattern),
    )

    no_name_pattern = r"no\s?name"
    no_name_no_name = and_(
        lower_first_name.bool_op("~")(no_name_pattern),
        lower_last_name.bool_op("~")(no_name_pattern),
    )

    composite_condition = or_(
        unknown,
        question_mark,
        john_or_jane_doe,
        no_name,
        none_friends,
        red_flag,
        digits_only,
        no_name_no_name,
    )

    delete_stmt = delete(pc).where(composite_condition)

    return conn.execute(delete_stmt)
198+
199+
132200
def get_automatic_matches(conn):
133201
pc1 = PdpContacts.__table__.alias()
134202
pc2 = PdpContacts.__table__.alias()

0 commit comments

Comments
 (0)