# requires-python = ">=3.13"
# dependencies = [
#     "marimo",
#     "pandas==2.3.3",
#     "pyarrow==21.0.0",
# ]
# ///

@@ -24,36 +26,249 @@ def _(mo):
2426 return
2527
2628
@app.cell
def _():
    # Helper functions shared by the dataframe-building cells below.

    import math
    from pathlib import Path

    import pandas as pd

    def convert_size(size_bytes: int) -> str:
        """Convert a byte count into a human-readable string (e.g. "1.21GB").

        Assumes a non-negative integer byte count; 0 is special-cased
        because log(0) is undefined.
        """
        if size_bytes == 0:
            return "0B"
        size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
        # Largest 1024-based unit that keeps the displayed value >= 1.
        i = math.floor(math.log(size_bytes, 1024))
        p = math.pow(1024, i)
        s = round(size_bytes / p, 2)
        return f"{s}{size_name[i]}"

    def parse_s3_keys(dataframe: pd.DataFrame) -> pd.DataFrame:
        """Parse S3 keys to extract additional metadata and update dataframe.

        Assumes "key" values look like
        ``<8 uuid fragments>/<bagname>/<file path...>`` — TODO confirm
        against the inventory layout.
        """
        # Split once and reuse; the original split the key three times.
        key_split = dataframe["key"].str.split("/")
        key_parts = dataframe["key"].str.split("/", expand=True)

        dataframe.loc[:, "bagname"] = key_parts[8]

        # BUGFIX: '"-".join(str(part) for part in uuid_parts)' stringified
        # whole Series objects, writing the same multi-row repr blob into
        # every row.  Concatenate element-wise so each row gets its own
        # 8-4-4-4-12 UUID built from the first eight key segments.
        dataframe.loc[:, "uuid"] = (
            key_parts[0] + key_parts[1]
            + "-" + key_parts[2]
            + "-" + key_parts[3]
            + "-" + key_parts[4]
            + "-" + key_parts[5] + key_parts[6] + key_parts[7]
        )

        dataframe.loc[:, "file"] = key_split.str[-1]
        dataframe.loc[:, "filepath"] = key_split.str[9:].apply("/".join)
        dataframe.loc[:, "extension"] = dataframe["filepath"].apply(
            lambda x: Path(x).suffix.lower()
        )
        return dataframe

    def is_metadata(dataframe: pd.DataFrame) -> pd.DataFrame:
        """Flag rows whose key contains a known metadata file/path fragment."""
        metadata_files = [
            "data/logs",
            "data/METS",
            "data/README.html",
            "data/objects/metadata",
            "data/objects/submissionDocumentation",
            "bag-info.txt",
            "bagit.txt",
            "manifest-sha256.txt",
            "tagmanifest-sha256.txt",
        ]
        dataframe.loc[:, "is_metadata"] = dataframe["key"].apply(
            lambda x: any(metadata_file in x for metadata_file in metadata_files)
        )

        return dataframe

    return Path, convert_size, is_metadata, parse_s3_keys, pd
91+
92+
@app.cell
def _(Path, is_metadata, parse_s3_keys, pd):
    # Build the inventory dataframe from the S3 inventory parquet exports.

    import os

    parquet_files = Path(os.environ["INVENTORY_LOCATIONS"]).glob("*.parquet")
    frames = (pd.read_parquet(f) for f in parquet_files)
    inventory_df = pd.concat(frames, ignore_index=True)
    inventory_df = inventory_df.drop_duplicates().reset_index(drop=True)

    # A row is "current" when it is the latest version of the object and
    # not a delete marker.
    inventory_df.loc[:, "is_current"] = (
        inventory_df["is_latest"] & ~inventory_df["is_delete_marker"]
    )

    inventory_df = parse_s3_keys(inventory_df)
    inventory_df = is_metadata(inventory_df)
    cdps_df = inventory_df.loc[inventory_df["is_current"]].copy()
    return (cdps_df,)
114+
115+
@app.cell
def _(cdps_df, mo):
    # Files: per-bucket / per-extension counts, largest files, and the
    # content-vs-metadata split, rendered as a stacked display.

    # Number of files per bucket, largest first.
    file_count = (
        cdps_df.groupby("bucket").size().to_frame("file count")
    ).sort_values(by="file count", ascending=False)

    # Number of files per (lower-cased) extension, largest first.
    file_extensions = (
        cdps_df.groupby("extension").size().to_frame("file count")
    ).sort_values(by="file count", ascending=False)

    # Ten largest files by size.
    file_storage = (
        cdps_df.sort_values(by="size", ascending=False)[["file", "size"]]
        .reset_index(drop=True)
        .head(10)
    )

    # Content vs metadata file counts, with readable index labels.
    file_metadata = (
        cdps_df.groupby("is_metadata")
        .size()
        .rename(index={False: "content files", True: "metadata files"})
        .to_frame("file count")
    )

    files_display = mo.vstack(
        [
            mo.md("#### File count by bucket"),
            file_count,
            mo.md("#### File count by extension"),
            file_extensions,
            mo.md("#### Largest 10 files"),
            file_storage,
            mo.md("#### Content vs metadata files"),
            file_metadata,
        ],
        gap=1,
    )
    return (files_display,)
158+
159+
@app.cell
def _(mo):
    # Storage section — placeholder until real storage stats are implemented.

    storage = {"not implemented": "not implemented"}

    storage_display = mo.vstack([storage], gap=1)
    return (storage_display,)
43171
44172
@app.cell
def _(mo):
    # AIPs section — placeholder until real AIP stats are implemented.

    aips = {"not implemented": "not implemented"}

    aip_display = mo.vstack([aips], gap=1)
    return (aip_display,)
184+
185+
@app.cell
def _(mo):
    # Digitized vs born-digital content — placeholder section.

    digitized_born_digital = {"not implemented": "not implemented"}

    digitized_born_digital_display = mo.vstack([digitized_born_digital], gap=1)
    return (digitized_born_digital_display,)
197+
198+
@app.cell
def _(mo):
    # Image vs AV — placeholder section.

    image_av = {"not implemented": "not implemented"}

    image_av_display = mo.vstack([image_av], gap=1)
    return (image_av_display,)
210+
211+
@app.cell
def _(mo):
    # Original vs duplicate files — placeholder section.

    original_duplicate = {"not implemented": "not implemented"}

    original_duplicate_display = mo.vstack([original_duplicate], gap=1)
    return (original_duplicate_display,)
223+
224+
@app.cell
def _(cdps_df, convert_size, mo):
    # Summary stats shown above the accordion: total file count and total
    # storage consumed by current objects.

    total_files = mo.stat(
        label="Total files",
        value=f"{len(cdps_df)}",
    )

    # convert_size() already returns a str, so the f-string wrapper was
    # redundant (and its same-quote nested "size" needed PEP 701 / 3.12+);
    # pass the result through directly.
    total_storage = mo.stat(
        label="Total storage",
        value=convert_size(cdps_df["size"].sum()),
    )

    summary = mo.hstack([total_files, total_storage], widths="equal", gap=1)
    return (summary,)
241+
242+
@app.cell
def _(
    aip_display,
    digitized_born_digital_display,
    files_display,
    image_av_display,
    mo,
    original_duplicate_display,
    storage_display,
    summary,
):
    # Assemble the dashboard: summary stats on top, one lazily-rendered
    # accordion section per topic below.

    sections = {
        "Files": files_display,
        "Storage": storage_display,
        "AIPs": aip_display,
        "Digitized vs born-digital content": digitized_born_digital_display,
        "Image vs AV": image_av_display,
        "Original vs duplicate files": original_duplicate_display,
    }
    accordion = mo.accordion(lazy=True, items=sections)

    mo.vstack(["Summary", summary, accordion], gap=1)
    return
58273
59274
0 commit comments