# requires-python = ">=3.13"
# dependencies = [
#     "marimo",
#     "pandas==2.3.3",
#     "pyarrow==21.0.0",
# ]
# ///
79
@@ -25,35 +27,243 @@ def _(mo):
2527
2628
@app.cell
def _():
    # Helper functions shared by the inventory-processing cells below.

    import math
    from pathlib import Path

    import pandas as pd

    def convert_size(size_bytes):
        """Convert a byte count into a human-readable string (e.g. '1.21 KB').

        Returns "0B" for zero; otherwise picks the largest 1024-based unit
        whose magnitude fits and rounds to two decimal places.
        """
        if size_bytes == 0:
            return "0B"
        size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
        i = math.floor(math.log(size_bytes, 1024))
        p = math.pow(1024, i)
        s = round(size_bytes / p, 2)
        return f"{s} {size_name[i]}"

    def parse_s3_keys(dataframe: pd.DataFrame) -> pd.DataFrame:
        """Parse S3 keys to extract additional metadata and update dataframe.

        Mutates *dataframe* in place and also returns it. Assumes each key
        has at least 10 "/"-separated segments: segments 0-7 spell a UUID
        (grouped 8-4-4-4-12), segment 8 is the bag name, and the remainder
        is the file path inside the bag — TODO confirm against real keys.
        """
        key_parts = dataframe["key"].str.split("/", expand=True)

        dataframe.loc[:, "bagname"] = key_parts[8]
        # Reassemble the UUID per row via element-wise Series concatenation.
        # BUG FIX: the previous `"-".join(str(part) for part in uuid_parts)`
        # called str() on whole Series objects, joining their reprs into one
        # garbage string that was broadcast to every row of the column.
        dataframe.loc[:, "uuid"] = (
            key_parts[0]
            + key_parts[1]
            + "-"
            + key_parts[2]
            + "-"
            + key_parts[3]
            + "-"
            + key_parts[4]
            + "-"
            + key_parts[5]
            + key_parts[6]
            + key_parts[7]
        )

        dataframe.loc[:, "file"] = dataframe["key"].str.split("/").str[-1]
        # Everything after the bag name is the in-bag file path.
        dataframe.loc[:, "filepath"] = (
            dataframe["key"].str.split("/").str[9:].apply("/".join)
        )
        # Lower-cased so ".PDF" and ".pdf" group together downstream.
        dataframe.loc[:, "extension"] = dataframe["filepath"].apply(
            lambda x: Path(x).suffix.lower()
        )
        return dataframe

    def is_metadata(dataframe: pd.DataFrame) -> pd.DataFrame:
        """Identify metadata files: adds a boolean "is_metadata" column.

        A file counts as metadata when its key contains any of the known
        bagit/packaging metadata path fragments. Mutates *dataframe* in
        place and also returns it.
        """
        metadata_files = [
            "data/logs",
            "data/METS",
            "data/README.html",
            "data/objects/metadata",
            "data/objects/submissionDocumentation",
            "bag-info.txt",
            "bagit.txt",
            "manifest-sha256.txt",
            "tagmanifest-sha256.txt",
        ]
        dataframe.loc[:, "is_metadata"] = dataframe["key"].apply(
            lambda x: any(metadata_file in x for metadata_file in metadata_files)
        )

        return dataframe

    return Path, convert_size, is_metadata, parse_s3_keys, pd
91+
92+
@app.cell
def _(Path, is_metadata, parse_s3_keys, pd):  # noqa: N803
    # Generate inventory dataframe

    import os

    # Combine every parquet inventory file, dropping exact duplicate rows.
    parquet_files = Path(os.environ["INVENTORY_LOCATIONS"]).glob("*.parquet")
    frames = (pd.read_parquet(f) for f in parquet_files)
    inventory_df = (
        pd.concat(frames, ignore_index=True).drop_duplicates().reset_index(drop=True)
    )

    # A row is "current" when it is the latest version and not a delete marker.
    inventory_df.loc[:, "is_current"] = (
        inventory_df["is_latest"] & ~inventory_df["is_delete_marker"]
    )

    inventory_df = parse_s3_keys(inventory_df)
    inventory_df = is_metadata(inventory_df)
    cdps_df = inventory_df.loc[inventory_df["is_current"]].copy()
    return (cdps_df,)
43114
44115
@app.cell
def _(cdps_df, convert_size, mo):
    # Summary stats

    total_size = cdps_df["size"].sum()

    total_files = mo.stat(
        label="Total files",
        value=f"{len(cdps_df)}",
    )
    total_storage = mo.stat(
        label="Total storage",
        value=f"{convert_size(total_size)}",
    )

    summary = mo.hstack([total_files, total_storage], widths="equal", gap=1)
    return (summary,)
132+
133+
@app.cell
def _(cdps_df):
    # Data filters

    # Per-bucket and per-extension counts, largest first.
    file_count = (
        cdps_df.groupby("bucket")
        .size()
        .to_frame("file count")
        .sort_values(by="file count", ascending=False)
    )
    file_extensions = (
        cdps_df.groupby("extension")
        .size()
        .to_frame("file count")
        .sort_values(by="file count", ascending=False)
    )
    # Top 10 files by size.
    file_storage = (
        cdps_df.sort_values(by="size", ascending=False)
        .loc[:, ["file", "size"]]
        .reset_index(drop=True)[:10]
    )
    file_metadata = (
        cdps_df.groupby("is_metadata")
        .size()
        .rename(index={False: "content files", True: "metadata files"})
        .to_frame("file count")
    )
    # Placeholder sections awaiting implementation.
    storage = {"not implemented": "not implemented"}
    aips = {"not implemented": "not implemented"}
    digitized_born_digital = {"not implemented": "not implemented"}
    image_av = {"not implemented": "not implemented"}
    original_duplicate = {"not implemented": "not implemented"}
    return (
        aips,
        digitized_born_digital,
        file_count,
        file_extensions,
        file_metadata,
        file_storage,
        image_av,
        original_duplicate,
        storage,
    )
177+
178+
@app.cell
def _(
    aips,
    digitized_born_digital,
    file_count,
    file_extensions,
    file_metadata,
    file_storage,
    image_av,
    mo,
    original_duplicate,
    storage,
):
    # Data displays

    files_display = mo.vstack(
        [
            mo.md("#### File count by bucket"),
            file_count,
            mo.md("#### File count by extension"),
            file_extensions,
            mo.md("#### Largest 10 files"),
            file_storage,
            mo.md("#### Content vs metadata files"),
            file_metadata,
        ],
        gap=1,
    )

    # One stacked display per accordion section.
    storage_display = mo.vstack([storage], gap=1)
    aip_display = mo.vstack([aips], gap=1)
    digitized_born_digital_display = mo.vstack([digitized_born_digital], gap=1)
    image_av_display = mo.vstack([image_av], gap=1)
    original_duplicate_display = mo.vstack([original_duplicate], gap=1)

    return (
        aip_display,
        digitized_born_digital_display,
        files_display,
        image_av_display,
        original_duplicate_display,
        storage_display,
    )
236+
237+
@app.cell
def _(
    aip_display,
    digitized_born_digital_display,
    files_display,
    image_av_display,
    mo,
    original_duplicate_display,
    storage_display,
    summary,
):
    # Dashboard

    # Collapsible sections; lazy=True defers rendering until opened.
    sections = {
        "Files": files_display,
        "Storage": storage_display,
        "AIPs": aip_display,
        "Digitized vs born-digital content": digitized_born_digital_display,
        "Image vs AV": image_av_display,
        "Original vs duplicate files": original_duplicate_display,
    }
    accordion = mo.accordion(lazy=True, items=sections)

    # Last expression is the cell's rendered output.
    mo.vstack(["Summary", summary, accordion], gap=1)
    return
58268
59269
0 commit comments