-
-
Notifications
You must be signed in to change notification settings - Fork 133
1301 nd neutral citation in the html #1477
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 7 commits
cf246c8
ebe178a
da04fd1
4fb7e64
9331199
a405db9
332b73b
cce7716
60d8472
29559b0
2b3e4fe
e54957e
30b2a71
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,10 @@ | ||
# Author: Phil Ardery | ||
# Contact: https://www.ndcourts.gov/contact-us | ||
# History: | ||
# Date created: 2019-02-28 | ||
# Updated: 2024-05-08, grossir: to OpinionSiteLinear and new URL | ||
# Updated: 2025-07-02, luism: get citation from HTML | ||
|
||
import re | ||
from datetime import date, datetime | ||
from urllib.parse import urljoin | ||
|
@@ -15,6 +18,7 @@ | |
class Site(OpinionSiteLinear): | ||
base_url = "https://www.ndcourts.gov/" | ||
ordered_fields = [ | ||
"citation", | ||
"name", | ||
"docket", | ||
"date", | ||
|
@@ -50,11 +54,30 @@ def _process_html(self) -> None: | |
|
||
for idx, txt in enumerate(raw_values[:5]): | ||
if idx == 0: | ||
txt, _ = self.clean_name(txt) | ||
# Separate case name and citation if present | ||
match = re.match( | ||
r"^(.*?)(\s*((\d{4}\sND\s\d+)|(1 \d\.N\.W\d d\d+)))?\s*$", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's with the 1? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't see any way to actually identify the nw2d and nw3d citations, but the regex here looks wrong to me. |
||
txt, | ||
) | ||
if match: | ||
case_name = match.group(1).strip() | ||
# If matched with the second regex (1 \d\.N\.W\d d\d+), set citation to "" | ||
if match.group(5): | ||
citation = "" | ||
else: | ||
citation = ( | ||
match.group(2).strip() | ||
if match.group(2) | ||
else "" | ||
) | ||
txt = case_name | ||
else: | ||
citation = "" | ||
txt = self.clean_name(txt) | ||
values.append(citation) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is getting a bit unwieldy. I think the few examples of nw3d with broken citations should be ignored. Focus solely on the ND format and ignore the rest. It's not worth the hassle, in my opinion, for bad data. |
||
else: | ||
txt = txt.split(":", 1)[1].strip() | ||
values.append(txt) | ||
|
||
summary = ( | ||
" ".join(raw_values[5:]).strip() if len(raw_values) > 5 else "" | ||
) | ||
|
@@ -72,7 +95,7 @@ def _process_html(self) -> None: | |
continue | ||
seen_urls.add(url) | ||
|
||
case = dict(zip(self.ordered_fields, values[:5])) | ||
case = dict(zip(self.ordered_fields, values[:6])) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is the culprit: you assume 6 items in the list, but you only have 5 in one path. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We have 5 raw values, but we added the citation, so we need the first 6 values. |
||
case.update({"summary": summary, "url": url, "per_curiam": False}) | ||
|
||
if "per curiam" in case["judge"].lower(): | ||
|
@@ -81,7 +104,7 @@ def _process_html(self) -> None: | |
|
||
self.cases.append(case) | ||
|
||
def clean_name(self, name: str) -> tuple[str, str]: | ||
def clean_name(self, name: str) -> str: | ||
"""Cleans case name | ||
|
||
Some case names list the consolidated docket or a | ||
|
@@ -90,14 +113,12 @@ def clean_name(self, name: str) -> tuple[str, str]: | |
:param name: raw case name | ||
:return: cleaned name and extra_docket numbers | ||
""" | ||
other_dockets = "" | ||
if "(consolidated w/" in name: | ||
other_dockets = ",".join(re.findall(r"\d{8}", name)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You updated the function but didn't update the docstring. It still says it's returning extra docket numbers. But shouldn't those extra docket numbers be included in the final docket numbers? Can you create a second example HTML that captures these edge cases so I can see how they are processed? |
||
name = name.split("(consolidated w/")[0] | ||
if "(CONFIDENTIAL" in name: | ||
name = name.split("(CONFIDENTIAL")[0] | ||
|
||
return name.strip(), other_dockets | ||
return name.strip() | ||
|
||
def extract_from_text(self, scraped_text: str) -> dict: | ||
"""Extract model fields from opinion's document text | ||
|
Uh oh!
There was an error while loading. Please reload this page.