Skip to content

Commit 597842a

Browse files
committed
add script to check if DB and production-definitions have the same data
This performs the check one month at a time; the months are hardcoded (currently January through June 2024). The output file is hardcoded to "2024-invalid_data.json".
1 parent 885cf23 commit 597842a

File tree

2 files changed

+176
-0
lines changed

2 files changed

+176
-0
lines changed
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
import json
2+
import os
3+
import urllib.parse
4+
5+
import pymongo
6+
import requests
7+
from azure.cosmos import cosmos_client
8+
from dotenv import load_dotenv
9+
10+
# Populate os.environ from a local .env file (if any) before reading settings.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here avoids
            silently continuing with the literal string "None", which is what
            ``str(os.environ.get(name))`` produces for a missing variable.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Required environment variable '{name}' is not set.")
    return value


# Connection string handed to pymongo.MongoClient.
MONGO_CONNECTION_STRING = _require_env("MONGO_CONNECTION_STRING")
# Database and collection that hold the definitions to verify.
DB_NAME = "clearlydefined"
COLLECTION_NAME = "definitions-trimmed"
# URL prefix under which definition blobs are fetched (no trailing slash is
# added or removed here; fetch_blob joins it with "/").
BASE_AZURE_BLOB_URL = _require_env("BASE_AZURE_BLOB_URL")
16+
17+
18+
# Example coordinates: composer/packagist/00f100/fcphp-cache/revision/0.1.0.json
19+
20+
# Mongo document with unused fields removed
21+
# {
22+
# "_id": "composer/packagist/00f100/fcphp-cache/0.1.0",
23+
# "_meta": {
24+
# "schemaVersion": "1.6.1",
25+
# "updated": "2019-08-29T02:06:54.498Z"
26+
# },
27+
# "coordinates": {
# "type": "composer",
28+
# "provider": "packagist",
29+
# "namespace": "00f100",
30+
# "name": "fcphp-cache",
31+
# "revision": "0.1.0"
32+
# },
33+
# "licensed": {
34+
# "declared": "MIT",
# "toolScore": {
35+
# "total": 17,
36+
# "declared": 0,
37+
# "discovered": 2,
38+
# "consistency": 0,
39+
# "spdx": 0,
40+
# "texts": 15
41+
# },
42+
# "score": {
43+
# "total": 17,
44+
# "declared": 0,
45+
# "discovered": 2,
46+
# "consistency": 0,
47+
# "spdx": 0,
48+
# "texts": 15
49+
# }
50+
# }
51+
# }
52+
53+
54+
def fetch_blob(base_url, type, provider, namespace, name, revision, timeout=30):
    """Fetch a definition blob from the Azure blob storage.

    The blob path is built as
    ``{type}/{provider}/{namespace}/{name}/revision/{revision}.json``,
    lower-cased and percent-encoded, then appended to *base_url*.

    Args:
        base_url: URL prefix of the blob container (joined with "/").
        type: Coordinate type, e.g. "composer". (The name shadows the builtin
            but is kept so existing keyword callers keep working.)
        provider: Coordinate provider, e.g. "packagist".
        namespace: Coordinate namespace.
        name: Component name.
        revision: Component revision.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the scan forever.

    Returns:
        The decoded JSON body on HTTP 200, otherwise an empty dict.

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    # quote() keeps "/" as a path separator but escapes everything else, so a
    # literal "%2f" inside a coordinate is sent as "%252f" instead of being
    # interpreted as a slash.
    blob_path = urllib.parse.quote(
        f"{type}/{provider}/{namespace}/{name}/revision/{revision}.json".lower()
    )
    res = requests.get(f"{base_url}/{blob_path}", timeout=timeout)
    if res.status_code != 200:
        # Callers treat a missing/unreadable blob as "no data".
        return {}
    return res.json()
66+
67+
68+
def dump_data(data, filename):
    """Write *data* as JSON to *filename*, replacing the file atomically.

    The JSON is first written to ``<filename>.tmp`` and then moved over the
    target with ``os.replace``. This file doubles as a checkpoint, so an
    interruption in the middle of a write must not leave a truncated file
    behind: either the previous content or the complete new content exists.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to (over)write.

    Raises:
        TypeError: if *data* is not JSON-serializable (the previous file
            content is left untouched in that case).
        OSError: if the file cannot be written or replaced.
    """
    tmp_filename = f"{filename}.tmp"
    try:
        with open(tmp_filename, "w", encoding="utf-8") as f:
            json.dump(data, f)
        os.replace(tmp_filename, filename)
    except Exception:
        # Do not leave a half-written temporary file lying around.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise
71+
72+
73+
# Open the Mongo connection and resolve the database and collection handles.
# A missing database or collection is only reported; the script carries on
# with the handle either way.
client = pymongo.MongoClient(MONGO_CONNECTION_STRING)

db = client[DB_NAME]
known_databases = client.list_database_names()
print(
    f"Using database: '{DB_NAME}'."
    if DB_NAME in known_databases
    else f"Database '{DB_NAME}' not found."
)

collection = db[COLLECTION_NAME]
known_collections = db.list_collection_names()
print(
    f"Using collection: '{COLLECTION_NAME}'."
    if COLLECTION_NAME in known_collections
    else f"Collection '{COLLECTION_NAME}' not found."
)
86+
87+
88+
# Months (YYYY-MM) to scan; each month is queried and sampled independently.
months = ["2024-01", "2024-02", "2024-03", "2024-04", "2024-05", "2024-06"]

# Report file; rewritten at every checkpoint and at the end of every month.
OUTPUT_FILENAME = "2024-invalid_data.json"
# Maximum number of documents compared against blob storage per month.
SAMPLE_LIMIT = 5000
# Number of processed documents between two checkpoint dumps.
CHECKPOINT_INTERVAL = 100
# Server-side time limit passed to the Mongo queries.
QUERY_MAX_TIME_MS = 10000000

# Report layout: {month: {"stats": {...}, <document _id>: {"db": ..., "blob": ...}}}
invalid_data = {}


def _month_bounds(month):
    """Return ``(start, end)`` date strings covering *month* ("YYYY-MM").

    ``start`` is the first day of the month (inclusive) and ``end`` the first
    day of the following month (exclusive). An exclusive upper bound is needed
    because the bounds are compared as strings: a timestamp such as
    "2024-01-31T10:00:00.000Z" sorts *after* "2024-01-31", so an inclusive
    "<month>-31" bound would drop every document updated on the 31st.
    """
    year, month_number = (int(part) for part in month.split("-"))
    if month_number == 12:
        next_year, next_month = year + 1, 1
    else:
        next_year, next_month = year, month_number + 1
    return f"{month}-01", f"{next_year:04d}-{next_month:02d}-01"


def _percent(part, whole):
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is 0."""
    return part / whole * 100 if whole else 0.0


for month in months:
    range_start, range_end = _month_bounds(month)
    # Documents updated during the month that carry no declared license.
    # Assumes `_meta.updated` is stored as an ISO-8601 string, as the
    # string-valued bounds imply.
    query = {
        "_meta.updated": {"$gte": range_start, "$lt": range_end},
        "licensed.declared": {"$exists": False},
    }

    docs = collection.find(query, max_time_ms=QUERY_MAX_TIME_MS).limit(SAMPLE_LIMIT)
    # NOTE(review): pymongo documents this option as `maxTimeMS` for
    # count_documents; the keyword is kept as originally written — confirm
    # how the target server treats `max_time_ms`.
    doc_count = collection.count_documents(query, max_time_ms=QUERY_MAX_TIME_MS)

    stats = {
        "sample_total": 0,
        "sample_invalid": 0,
    }
    invalid_data[month] = {"stats": stats}
    count = 0
    # Tracked separately: len(invalid_data[month]) would also count "stats".
    invalid_count = 0

    for count, doc in enumerate(docs, start=1):
        coordinates = doc["coordinates"]
        blob = fetch_blob(
            BASE_AZURE_BLOB_URL,
            coordinates["type"],
            coordinates["provider"],
            # "-" stands in for a missing (or null/empty) namespace.
            coordinates.get("namespace") or "-",
            coordinates["name"],
            coordinates["revision"],
        )
        db_declared = doc.get("licensed", {}).get("declared")
        blob_declared = blob.get("licensed", {}).get("declared")

        if db_declared != blob_declared:
            invalid_count += 1
            # only adding the licensed and meta fields to the invalid data
            invalid_data[month][doc["_id"]] = {
                "db": {
                    "licensed": db_declared,
                    "_meta": doc.get("_meta", {}),
                },
                "blob": {
                    "licensed": blob_declared,
                    "_meta": blob.get("_meta", {}),
                },
            }

        # Checkpoint in case mongo dies
        if count % CHECKPOINT_INTERVAL == 0:
            print(
                f"Checkpoint: total number of invalid data: {invalid_count}, total items {count} ({_percent(invalid_count, count)}%)"
            )
            stats["sample_total"] = count
            stats["sample_invalid"] = invalid_count
            dump_data(invalid_data, OUTPUT_FILENAME)

    # Final per-month figures; refreshed here so they are correct even when
    # the sample size is not a multiple of the checkpoint interval.
    stats["sample_total"] = count
    stats["sample_invalid"] = invalid_count
    stats["total_documents"] = doc_count
    # Extrapolate the sample's invalid ratio to all matching documents.
    stats["total_estimated_invalid"] = (
        doc_count * (invalid_count / count) if count else 0
    )
    stats["sample_percent_of_total"] = _percent(count, doc_count)
    dump_data(invalid_data, OUTPUT_FILENAME)
    print(
        f"Total number of invalid data: {invalid_count}, total items {count} ({_percent(invalid_count, count)}%)"
    )

print("Done")
dump_data(invalid_data, OUTPUT_FILENAME)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
azure-core==1.30.2
2+
azure-cosmos==4.7.0
3+
certifi==2024.6.2
4+
charset-normalizer==3.3.2
5+
dnspython==2.6.1
6+
idna==3.7
7+
pymongo==4.7.3
8+
python-dotenv==1.0.1
9+
requests==2.32.3
10+
six==1.16.0
11+
typing_extensions==4.12.2
12+
urllib3==2.2.1

0 commit comments

Comments
 (0)