1+ import json
2+ import os
3+ import urllib .parse
4+
5+ import pymongo
6+ import requests
7+ from azure .cosmos import cosmos_client
8+ from dotenv import load_dotenv
9+
# Pull connection settings from a local .env file into the environment
# before any of them are read.
load_dotenv()

# NOTE: str() mirrors the original behavior — a missing environment
# variable becomes the literal string "None" instead of raising.
MONGO_CONNECTION_STRING = str(os.environ.get("MONGO_CONNECTION_STRING"))
DB_NAME = "clearlydefined"
COLLECTION_NAME = "definitions-trimmed"
BASE_AZURE_BLOB_URL = str(os.environ.get("BASE_AZURE_BLOB_URL"))
16+
17+
18+ # Example coordinates: composer/packagist/00f100/fcphp-cache/revision/0.1.0.json
19+
20+ # Mongo document with unused fields removed
21+ # {
22+ # "_id": "composer/packagist/00f100/fcphp-cache/0.1.0",
23+ # "_meta": {
24+ # "schemaVersion": "1.6.1",
25+ # "updated": "2019-08-29T02:06:54.498Z"
26+ # },
#     "coordinates": {
#         "type": "composer",
28+ # "provider": "packagist",
29+ # "namespace": "00f100",
30+ # "name": "fcphp-cache",
31+ # "revision": "0.1.0"
32+ # },
33+ # "licensed": {
#         "declared": "MIT",
#         "toolScore": {
35+ # "total": 17,
36+ # "declared": 0,
37+ # "discovered": 2,
38+ # "consistency": 0,
39+ # "spdx": 0,
40+ # "texts": 15
41+ # },
42+ # "score": {
43+ # "total": 17,
44+ # "declared": 0,
45+ # "discovered": 2,
46+ # "consistency": 0,
47+ # "spdx": 0,
48+ # "texts": 15
49+ # }
50+ # }
51+ # }
52+
53+
def fetch_blob(base_url, type, provider, namespace, name, revision, timeout=30):
    """Fetch a definition blob from Azure blob storage.

    Args:
        base_url: Root URL of the blob container.
        type: Package type, e.g. "composer". NOTE: shadows the ``type``
            builtin; the name is kept for backward compatibility.
        provider: Package provider, e.g. "packagist".
        namespace: Package namespace, or "-" when the package has none.
        name: Package name.
        revision: Package revision/version.
        timeout: Seconds before the HTTP request is aborted (default 30).

    Returns:
        The decoded JSON document, or ``{}`` when the request fails or
        does not return HTTP 200.
    """
    # Percent-encode the path so special characters (e.g. literal "%2f"
    # sequences in coordinates) survive; quote() leaves "/" intact by default,
    # which is what we want for path separators.
    path = urllib.parse.quote(
        f"{type}/{provider}/{namespace}/{name}/revision/{revision}.json".lower()
    )
    url = f"{base_url}/{path}"
    try:
        # timeout prevents a single stuck request from hanging the whole scan.
        res = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat network errors the same as a missing blob.
        return {}
    if res.status_code != 200:
        return {}
    return res.json()
66+
67+
def dump_data(data, filename):
    """Serialize *data* to *filename* as JSON, overwriting any existing file."""
    with open(filename, "w") as out:
        out.write(json.dumps(data))
71+
72+
client = pymongo.MongoClient(MONGO_CONNECTION_STRING)

# Indexing a pymongo client/database never fails, so existence is only
# reported; the script proceeds either way (best effort).
db = client[DB_NAME]
if DB_NAME in client.list_database_names():
    print(f"Using database: '{DB_NAME}'.")
else:
    print(f"Database '{DB_NAME}' not found.")

collection = db[COLLECTION_NAME]
if COLLECTION_NAME in db.list_collection_names():
    print(f"Using collection: '{COLLECTION_NAME}'.")
else:
    print(f"Collection '{COLLECTION_NAME}' not found.")
86+
87+
months = ["2024-01", "2024-02", "2024-03", "2024-04", "2024-05", "2024-06"]

OUTPUT_FILE = "2024-invalid_data.json"

# Per-month report: {month: {"stats": {...}, <doc_id>: {"db": ..., "blob": ...}}}
invalid_data = {}

for month in months:
    # NOTE(review): "$lte": f"{month}-31" is a *string* comparison, so a
    # timestamp like "2024-01-31T02:06..." sorts AFTER "2024-01-31" and is
    # excluded — documents updated on the 31st are silently skipped.
    # Confirm whether that is intended before changing the bound.
    query = {
        "_meta.updated": {"$gte": f"{month}-01", "$lte": f"{month}-31"},
        "licensed.declared": {"$exists": False},
    }

    # Sample up to 5000 docs, but also count the full population so the
    # sampled invalid ratio can be extrapolated.
    docs = collection.find(query, max_time_ms=10000000).limit(5000)
    doc_count = collection.count_documents(query, max_time_ms=10000000)

    invalid_data[month] = {
        "stats": {
            "sample_total": 0,
            "sample_invalid": 0,
        }
    }
    count = 0  # number of sampled documents processed so far

    for doc in docs:
        count += 1
        coords = doc["coordinates"]
        blob = fetch_blob(
            BASE_AZURE_BLOB_URL,
            coords["type"],
            coords["provider"],
            coords.get("namespace", "-"),
            coords["name"],
            coords["revision"],
        )
        db_licensed = doc.get("licensed", {})
        blob_licensed = blob.get("licensed", {})

        if db_licensed.get("declared") != blob_licensed.get("declared"):
            # Only record the licensed and _meta fields of the mismatch.
            invalid_data[month][doc["_id"]] = {
                "db": {
                    "licensed": db_licensed.get("declared"),
                    "_meta": doc.get("_meta", {}),
                },
                "blob": {
                    "licensed": blob_licensed.get("declared"),
                    "_meta": blob.get("_meta", {}),
                },
            }

        # Checkpoint in case mongo dies.  The "stats" key lives inside
        # invalid_data[month], so subtract 1 to count only invalid docs
        # (the original len(...) was off by one).
        if count % 100 == 0:
            invalid_count = len(invalid_data[month]) - 1
            print(
                f"Checkpoint: total number of invalid data: {invalid_count}, "
                f"total items {count} ({invalid_count / count * 100}%)"
            )
            invalid_data[month]["stats"]["sample_total"] = count
            invalid_data[month]["stats"]["sample_invalid"] = invalid_count
            dump_data(invalid_data, OUTPUT_FILE)

    # Final stats for the month.  Written unconditionally so the tail of the
    # sample is not lost when count is not a multiple of 100, and guarded
    # against empty months (count == 0 previously raised ZeroDivisionError).
    invalid_count = len(invalid_data[month]) - 1
    invalid_ratio = invalid_count / count if count else 0.0
    invalid_data[month]["stats"]["sample_total"] = count
    invalid_data[month]["stats"]["sample_invalid"] = invalid_count
    invalid_data[month]["stats"]["total_documents"] = doc_count
    # Extrapolate the sampled invalid ratio to the whole month.
    invalid_data[month]["stats"]["total_estimated_invalid"] = doc_count * invalid_ratio
    # Fixed: the original doc_count * (count / doc_count) simplified to count;
    # report the sample's share of the population as an actual percentage.
    invalid_data[month]["stats"]["sample_percent_of_total"] = (
        count / doc_count * 100 if doc_count else 0.0
    )
    dump_data(invalid_data, OUTPUT_FILE)
    print("Done")
    print(
        f"Total number of invalid data: {invalid_count}, "
        f"total items {count} ({invalid_ratio * 100}%)"
    )
    dump_data(invalid_data, OUTPUT_FILE)