@@ -70,6 +70,9 @@ def start_flow():
70
70
logger .debug ("Clearing pdp_contacts to prepare for match" )
71
71
reset_pdp_contacts_with_unmatched (conn )
72
72
73
+ logger .debug ("Removing invalid entries from pdp_contacts" )
74
+ filter_invalid_pdp_data (conn )
75
+
73
76
logger .debug ("Computing automatic matches" )
74
77
automatic_matches = get_automatic_matches (conn )
75
78
logger .debug ("Computing manual matches" )
@@ -129,6 +132,71 @@ def compare_names(n1, n2):
129
132
return name_to_array (n1 ).bool_op ("&&" )(name_to_array (n2 ))
130
133
131
134
135
def filter_invalid_pdp_data(conn):
    """Delete PdpContacts rows whose names are obvious placeholders.

    A row is deleted when its lower-cased first/last name matches any of:

    * both names contain "unknown"
    * both names are exactly "?"
    * first name "john" or "jane" with last name "doe"
    * first name "no" with last name "name"
    * first name NULL with last name "friends"
    * either name is exactly "(red flag)"
    * both names consist solely of digits
    * both names contain "noname" / "no name" (one optional whitespace char)

    Args:
        conn: Connection on which the DELETE statement is executed.

    Returns:
        Whatever ``conn.execute`` returns for the DELETE statement.
    """
    pc = PdpContacts.__table__.alias()
    lower_first_name = func.lower(pc.c.first_name)
    lower_last_name = func.lower(pc.c.last_name)

    unknown = and_(
        lower_first_name.ilike("%unknown%"),
        lower_last_name.ilike("%unknown%"),
    )

    question_mark = and_(
        lower_first_name == "?",
        lower_last_name == "?",
    )

    john_or_jane_doe = and_(
        or_(
            lower_first_name == "john",
            lower_first_name == "jane",
        ),
        lower_last_name == "doe",
    )

    no_name = and_(
        lower_first_name == "no",
        lower_last_name == "name",
    )

    # LOWER(NULL) is NULL, so the IS NULL test still works on the lowered column.
    none_friends = and_(
        lower_first_name.is_(None),
        lower_last_name == "friends",
    )

    # Unlike the other rules, a single matching column is enough here.
    red_flag = or_(
        lower_first_name == "(red flag)",
        lower_last_name == "(red flag)",
    )

    # `bool_op("~")` emits the `~` regex-match operator as a boolean
    # expression, so these rules can be combined with and_/or_ like the rest
    # instead of dropping down to raw text() SQL.  Patterns are raw strings so
    # that `\d` / `\s` reach the database untouched rather than being treated
    # as (invalid) Python escape sequences.
    digits_pattern = r"^\d+$"
    no_name_pattern = r"no\s?name"

    digits_only = and_(
        lower_first_name.bool_op("~")(digits_pattern),
        lower_last_name.bool_op("~")(digits_pattern),
    )
    no_name_no_name = and_(
        lower_first_name.bool_op("~")(no_name_pattern),
        lower_last_name.bool_op("~")(no_name_pattern),
    )

    composite_condition = or_(
        unknown,
        question_mark,
        john_or_jane_doe,
        no_name,
        none_friends,
        red_flag,
        digits_only,
        no_name_no_name,
    )

    delete_stmt = delete(pc).where(composite_condition)

    return conn.execute(delete_stmt)
198
+
199
+
132
200
def get_automatic_matches (conn ):
133
201
pc1 = PdpContacts .__table__ .alias ()
134
202
pc2 = PdpContacts .__table__ .alias ()
0 commit comments