Skip to content

Commit 756f0c2

Browse files
committed
WIP
1 parent 520ae46 commit 756f0c2

File tree

2 files changed

+237
-24
lines changed

2 files changed

+237
-24
lines changed

notebook.py

Lines changed: 234 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# requires-python = ">=3.13"
33
# dependencies = [
44
# "marimo",
5+
# "pandas==2.3.3",
6+
# "pyarrow==21.0.0",
57
# ]
68
# ///
79

@@ -25,35 +27,243 @@ def _(mo):
2527

2628

2729
@app.cell
def _():
    # Functions

    import math
    from pathlib import Path

    import pandas as pd

    def convert_size(size_bytes: int) -> str:
        """Convert byte counts into a human readable format (e.g. '1.21 MB')."""
        if size_bytes == 0:
            return "0B"
        size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
        i = math.floor(math.log(size_bytes, 1024))
        p = math.pow(1024, i)
        s = round(size_bytes / p, 2)
        return f"{s} {size_name[i]}"

    def parse_s3_keys(dataframe: pd.DataFrame) -> pd.DataFrame:
        """Parse S3 keys to extract additional metadata and update dataframe.

        Adds ``bagname``, ``uuid``, ``file``, ``filepath`` and ``extension``
        columns derived from the ``key`` column. Assumes keys have at least
        10 ``/``-separated segments, with the UUID hex spread over segments
        0-7 and the bag name in segment 8 — TODO confirm against the
        inventory key layout.
        """
        key_parts = dataframe["key"].str.split("/", expand=True)

        dataframe.loc[:, "bagname"] = key_parts[8]
        # BUG FIX: previously this did
        #   "-".join(str(part) for part in uuid_parts)
        # over a list of pandas Series, which stringifies each whole Series
        # (a multi-line repr) instead of joining values row by row.
        # Concatenate the Series element-wise so every row gets its own
        # reconstructed UUID (8-4-4-4-12 grouping).
        dataframe.loc[:, "uuid"] = (
            key_parts[0]
            + key_parts[1]
            + "-"
            + key_parts[2]
            + "-"
            + key_parts[3]
            + "-"
            + key_parts[4]
            + "-"
            + key_parts[5]
            + key_parts[6]
            + key_parts[7]
        )

        dataframe.loc[:, "file"] = dataframe["key"].str.split("/").str[-1]
        # Everything from segment 9 onward is the path inside the bag.
        dataframe.loc[:, "filepath"] = (
            dataframe["key"].str.split("/").str[9:].apply("/".join)
        )
        dataframe.loc[:, "extension"] = dataframe["filepath"].apply(
            lambda x: Path(x).suffix.lower()
        )
        return dataframe

    def is_metadata(dataframe: pd.DataFrame) -> pd.DataFrame:
        """Identifies metadata files in the DataFrame.

        Adds a boolean ``is_metadata`` column flagging keys containing any
        known metadata path fragment (BagIt tag files, METS, logs, etc.).
        """
        metadata_files = [
            "data/logs",
            "data/METS",
            "data/README.html",
            "data/objects/metadata",
            "data/objects/submissionDocumentation",
            "bag-info.txt",
            "bagit.txt",
            "manifest-sha256.txt",
            "tagmanifest-sha256.txt",
        ]
        dataframe.loc[:, "is_metadata"] = dataframe["key"].apply(
            lambda x: any(metadata_file in x for metadata_file in metadata_files)
        )

        return dataframe

    return Path, convert_size, is_metadata, parse_s3_keys, pd
92+
93+
@app.cell
def _(Path, is_metadata, parse_s3_keys, pd):  # noqa: N803
    # Generate inventory dataframe

    import os

    # Load every parquet inventory file from the configured location and
    # drop exact duplicate rows that appear across files.
    parquet_files = Path(os.environ["INVENTORY_LOCATIONS"]).glob("*.parquet")
    inventory_df = pd.concat(
        [pd.read_parquet(f) for f in parquet_files], ignore_index=True
    )
    inventory_df = inventory_df.drop_duplicates().reset_index(drop=True)

    # A key is "current" when it is the latest version and not a delete marker.
    inventory_df.loc[:, "is_current"] = (
        inventory_df["is_latest"] & ~inventory_df["is_delete_marker"]
    )

    # Enrich with key-derived metadata, then keep only current objects.
    inventory_df = parse_s3_keys(inventory_df)
    inventory_df = is_metadata(inventory_df)
    cdps_df = inventory_df.loc[inventory_df["is_current"]].copy()
    return (cdps_df,)
43114

44115

45116
@app.cell
def _(cdps_df, convert_size, mo):
    # Summary stats

    # Headline figures for the dashboard: file count and total bytes stored
    # (convert_size already returns a formatted string).
    total_files = mo.stat(label="Total files", value=str(len(cdps_df)))
    total_storage = mo.stat(
        label="Total storage",
        value=convert_size(cdps_df["size"].sum()),
    )

    summary = mo.hstack([total_files, total_storage], widths="equal", gap=1)
    return (summary,)
132+
133+
134+
@app.cell
def _(cdps_df):
    # Data filters

    def _size_by(column):
        # Count files per distinct value of *column*, busiest first.
        counts = cdps_df.groupby(column).size().to_frame("file count")
        return counts.sort_values(by="file count", ascending=False)

    file_count = _size_by("bucket")
    file_extensions = _size_by("extension")

    # Ten largest files by size.
    file_storage = (
        cdps_df.sort_values(by="size", ascending=False)
        .loc[:, ["file", "size"]]
        .reset_index(drop=True)
        .head(10)
    )

    # Content vs metadata split, with human-readable row labels.
    file_metadata = (
        cdps_df.groupby("is_metadata")
        .size()
        .rename(index={False: "content files", True: "metadata files"})
        .to_frame("file count")
    )

    # Placeholders for dashboard sections not built yet.
    storage = {"not implemented": "not implemented"}
    aips = {"not implemented": "not implemented"}
    digitized_born_digital = {"not implemented": "not implemented"}
    image_av = {"not implemented": "not implemented"}
    original_duplicate = {"not implemented": "not implemented"}
    return (
        aips,
        digitized_born_digital,
        file_count,
        file_extensions,
        file_metadata,
        file_storage,
        image_av,
        original_duplicate,
        storage,
    )
177+
178+
179+
@app.cell
def _(
    aips,
    digitized_born_digital,
    file_count,
    file_extensions,
    file_metadata,
    file_storage,
    image_av,
    mo,
    original_duplicate,
    storage,
):
    # Data displays

    # Files panel: interleave a markdown heading with each table.
    _file_sections = [
        ("#### File count by bucket", file_count),
        ("#### File count by extension", file_extensions),
        ("#### Largest 10 files", file_storage),
        ("#### Content vs metadata files", file_metadata),
    ]
    files_display = mo.vstack(
        [el for heading, table in _file_sections for el in (mo.md(heading), table)],
        gap=1,
    )

    # Remaining panels currently wrap placeholder content.
    storage_display = mo.vstack([storage], gap=1)
    aip_display = mo.vstack([aips], gap=1)
    digitized_born_digital_display = mo.vstack([digitized_born_digital], gap=1)
    image_av_display = mo.vstack([image_av], gap=1)
    original_duplicate_display = mo.vstack([original_duplicate], gap=1)
    return (
        aip_display,
        digitized_born_digital_display,
        files_display,
        image_av_display,
        original_duplicate_display,
        storage_display,
    )
236+
237+
238+
@app.cell
def _(
    aip_display,
    digitized_born_digital_display,
    files_display,
    image_av_display,
    mo,
    original_duplicate_display,
    storage_display,
    summary,
):
    # Dashboard

    # One collapsible panel per analysis area; lazy so panels render on open.
    _panels = {
        "Files": files_display,
        "Storage": storage_display,
        "AIPs": aip_display,
        "Digitized vs born-digital content": digitized_born_digital_display,
        "Image vs AV": image_av_display,
        "Original vs duplicate files": original_duplicate_display,
    }
    accordion = mo.accordion(lazy=True, items=_panels)

    # Final layout: summary stats on top, detail accordion below.
    mo.vstack(["Summary", summary, accordion], gap=1)
    return
58268

59269

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ dependencies = [
77
"marimo>=0.14.17",
88
]
99

10+
[tool.marimo.runtime]
11+
dotenv = [".env"]
12+
1013
[dependency-groups]
1114
dev = [
1215
"black>=25.1.0",

0 commit comments

Comments
 (0)