Skip to content

Commit 28d8da8

Browse files
committed
WIP
1 parent 520ae46 commit 28d8da8

File tree

3 files changed

+246
-24
lines changed

3 files changed

+246
-24
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,4 +208,7 @@ __marimo__/
208208

209209
AGENTS.md
210210

211-
.idea
211+
.idea
212+
213+
parquet_data/
214+
*.csv

notebook.py

Lines changed: 238 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# requires-python = ">=3.13"
33
# dependencies = [
44
# "marimo",
5+
# "pandas==2.3.3",
6+
# "pyarrow==21.0.0",
57
# ]
68
# ///
79

@@ -24,36 +26,249 @@ def _(mo):
2426
return
2527

2628

29+
@app.cell
30+
def _():
31+
# Functions
32+
33+
import math
34+
from pathlib import Path
35+
36+
import pandas as pd
37+
38+
def convert_size(size_bytes):
    """Render a byte count as a human-readable string (e.g. ``1.5 KB``).

    Uses base-1024 units; the scaled value is rounded to two decimals.
    A count of 0 is returned as the literal ``"0B"``.
    """
    units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    if not size_bytes:
        return "0B"
    exponent = math.floor(math.log(size_bytes, 1024))
    scaled = round(size_bytes / math.pow(1024, exponent), 2)
    return f"{scaled} {units[exponent]}"
47+
48+
def parse_s3_keys(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Parse S3 keys to extract additional metadata and update dataframe.

    The indexing below assumes keys shaped like
    ``<8 UUID segments>/<bagname>/<path inside the bag...>``; adds the
    columns ``bagname``, ``uuid``, ``file``, ``filepath`` and
    ``extension``. Mutates and returns *dataframe*.
    """
    key_parts = dataframe["key"].str.split("/", expand=True)

    dataframe.loc[:, "bagname"] = key_parts[8]
    # Reassemble the UUID elementwise by concatenating the part Series.
    # BUG FIX: the previous ``"-".join(str(part) for part in uuid_parts)``
    # called str() on whole Series objects (producing their repr) and
    # broadcast that single garbage string to every row.
    dataframe.loc[:, "uuid"] = (
        key_parts[0]
        + key_parts[1]
        + "-"
        + key_parts[2]
        + "-"
        + key_parts[3]
        + "-"
        + key_parts[4]
        + "-"
        + key_parts[5]
        + key_parts[6]
        + key_parts[7]
    )

    dataframe.loc[:, "file"] = dataframe["key"].str.split("/").str[-1]
    # Everything after the bagname is the path of the file inside the bag.
    dataframe.loc[:, "filepath"] = (
        dataframe["key"].str.split("/").str[9:].apply("/".join)
    )
    # Lower-cased so e.g. ".TIF" and ".tif" group together downstream.
    dataframe.loc[:, "extension"] = dataframe["filepath"].apply(
        lambda x: Path(x).suffix.lower()
    )
    return dataframe
70+
71+
def is_metadata(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Flag metadata files by adding a boolean ``is_metadata`` column.

    A key counts as metadata when it contains any of the known metadata
    path fragments as a substring. Mutates and returns *dataframe*.
    """
    metadata_markers = (
        "data/logs",
        "data/METS",
        "data/README.html",
        "data/objects/metadata",
        "data/objects/submissionDocumentation",
        "bag-info.txt",
        "bagit.txt",
        "manifest-sha256.txt",
        "tagmanifest-sha256.txt",
    )

    def contains_marker(key):
        # Substring (not prefix) match against every known marker.
        return any(marker in key for marker in metadata_markers)

    dataframe.loc[:, "is_metadata"] = dataframe["key"].apply(contains_marker)

    return dataframe
89+
90+
return Path, convert_size, is_metadata, parse_s3_keys, pd
91+
92+
93+
@app.cell
def _(Path, is_metadata, parse_s3_keys, pd):
    # Generate inventory dataframe

    import os

    # One parquet file per inventory dump: concatenate, then drop exact
    # duplicate rows that appear across dumps.
    inventory_location = Path(os.environ["INVENTORY_LOCATIONS"])
    frames = (
        pd.read_parquet(path) for path in inventory_location.glob("*.parquet")
    )
    inventory_df = pd.concat(frames, ignore_index=True)
    inventory_df = inventory_df.drop_duplicates().reset_index(drop=True)

    # A key is "current" when it is the latest version and not a delete marker.
    inventory_df.loc[:, "is_current"] = (
        inventory_df["is_latest"] & ~inventory_df["is_delete_marker"]
    )

    inventory_df = parse_s3_keys(inventory_df)
    inventory_df = is_metadata(inventory_df)
    cdps_df = inventory_df.loc[inventory_df["is_current"]].copy()
    return (cdps_df,)
114+
115+
116+
@app.cell
def _(cdps_df, mo):
    # Files

    def _count_by(column):
        # Row counts per distinct value of `column`, largest count first.
        return (
            cdps_df.groupby(column)
            .size()
            .to_frame("file count")
            .sort_values(by="file count", ascending=False)
        )

    file_count = _count_by("bucket")
    file_extensions = _count_by("extension")

    # Ten largest files by size.
    file_storage = (
        cdps_df.sort_values(by="size", ascending=False)
        .loc[:, ["file", "size"]]
        .reset_index(drop=True)[:10]
    )

    # Content files vs metadata files (labels replace the raw booleans).
    file_metadata = (
        cdps_df.groupby("is_metadata")
        .size()
        .rename(index={False: "content files", True: "metadata files"})
        .to_frame("file count")
    )

    files_display = mo.vstack(
        [
            mo.md("#### File count by bucket"),
            file_count,
            mo.md("#### File count by extension"),
            file_extensions,
            mo.md("#### Largest 10 files"),
            file_storage,
            mo.md("#### Content vs metadata files"),
            file_metadata,
        ],
        gap=1,
    )
    return (files_display,)
158+
159+
27160
@app.cell
def _(mo):
    # Storage

    # Placeholder until storage metrics are implemented.
    storage = {"not implemented": "not implemented"}

    storage_display = mo.vstack([storage], gap=1)
    return (storage_display,)
43171

44172

45173
@app.cell
def _(mo):
    # AIPs

    # Placeholder until AIP metrics are implemented.
    aips = {"not implemented": "not implemented"}

    aip_display = mo.vstack([aips], gap=1)
    return (aip_display,)
184+
185+
186+
@app.cell
def _(mo):
    # Digitized vs born-digital content

    # Placeholder until the digitized/born-digital split is implemented.
    digitized_born_digital = {"not implemented": "not implemented"}

    digitized_born_digital_display = mo.vstack([digitized_born_digital], gap=1)
    return (digitized_born_digital_display,)
197+
198+
199+
@app.cell
def _(mo):
    # Image vs AV

    # Placeholder until the image/AV split is implemented.
    image_av = {"not implemented": "not implemented"}

    image_av_display = mo.vstack([image_av], gap=1)
    return (image_av_display,)
210+
211+
212+
@app.cell
def _(mo):
    # Original vs duplicate files

    # Placeholder until duplicate detection is implemented.
    original_duplicate = {"not implemented": "not implemented"}

    original_duplicate_display = mo.vstack([original_duplicate], gap=1)
    return (original_duplicate_display,)
223+
224+
225+
@app.cell
def _(cdps_df, convert_size, mo):
    # Summary stats

    total_files = mo.stat(
        label="Total files",
        value=f"{len(cdps_df)}",
    )

    total_storage = mo.stat(
        label="Total storage",
        # convert_size already returns a string — the previous
        # f"{convert_size(cdps_df["size"].sum())}" wrapper was redundant
        # and relied on nested same-quote f-strings (3.12+ only syntax).
        value=convert_size(cdps_df["size"].sum()),
    )

    summary = mo.hstack([total_files, total_storage], widths="equal", gap=1)
    return (summary,)
241+
242+
243+
@app.cell
def _(
    aip_display,
    digitized_born_digital_display,
    files_display,
    image_av_display,
    mo,
    original_duplicate_display,
    storage_display,
    summary,
):
    # Dashboard

    # Section title -> rendered content; lazy=True defers rendering of
    # collapsed sections.
    sections = {
        "Files": files_display,
        "Storage": storage_display,
        "AIPs": aip_display,
        "Digitized vs born-digital content": digitized_born_digital_display,
        "Image vs AV": image_av_display,
        "Original vs duplicate files": original_duplicate_display,
    }
    accordion = mo.accordion(lazy=True, items=sections)

    mo.vstack(
        ["Summary", summary, accordion],
        gap=1,
    )
    return
58273

59274

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ dependencies = [
77
"marimo>=0.14.17",
88
]
99

10+
[tool.marimo.runtime]
11+
dotenv = [".env"]
12+
1013
[dependency-groups]
1114
dev = [
1215
"black>=25.1.0",
@@ -64,6 +67,7 @@ ignore = [
6467
"EM102",
6568
"FIX002",
6669
"G004",
70+
"N803",
6771
"N812",
6872
"PLR0912",
6973
"PLR0913",

0 commit comments

Comments
 (0)