Commit 4e827f6

Add parser updates and chunking logic
1 parent 78a1589 commit 4e827f6

File tree

8 files changed: +19389 lines added, 0 removed


parser/chapter1_chunks.json

Lines changed: 2801 additions & 0 deletions
Large diffs are not rendered by default.

parser/chapter2_chunks.json

Lines changed: 1352 additions & 0 deletions
Large diffs are not rendered by default.

parser/chapter3_chunks.json

Lines changed: 2378 additions & 0 deletions
Large diffs are not rendered by default.

parser/chapter4_chunks.json

Lines changed: 8786 additions & 0 deletions
Large diffs are not rendered by default.

parser/chapter5_chunks.json

Lines changed: 2468 additions & 0 deletions
Large diffs are not rendered by default.
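
The chapter JSON diffs are too large to render inline. For orientation, each record in chapter*_chunks.json follows the shape written by parser/parse_sicp.py further down; a minimal sketch, with purely illustrative values:

# Hypothetical record from parser/chapter1_chunks.json. The field names match
# what parse_sicp.py emits; the values here are only illustrative.
example_record = {
    "source_file": "section1.xml",
    "tag_type": "SECTION",
    "title": "The Elements of Programming",
    "parent_title": "Building Abstractions with Functions",
    "depth": 1,
    "paragraph_index": 1,
    "content": "First paragraph of the section, whitespace-collapsed ..."
}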

parser/chunking.py

Lines changed: 238 additions & 0 deletions
import os
import re
import json
from glob import glob
from itertools import groupby
from difflib import SequenceMatcher
from typing import List, Tuple

try:
    import nltk
    nltk.download("punkt", quiet=True)
    try:
        nltk.download("punkt_tab", quiet=True)
    except Exception:
        pass
    _NLTK_OK = True
except Exception:
    nltk = None
    _NLTK_OK = False

try:
    import tiktoken
    _ENC = tiktoken.get_encoding("cl100k_base")
except Exception:
    _ENC = None

# ---------- CONFIG ----------
CHAPTER_JSON_DIR = "."
CHAPTER_FILE_GLOB = "chapter*_chunks.json"
OUTPUT_FILE = "sicp_mesochunks_semantic_rag.json"

MAX_TOKENS = 300           # smaller chunks for RAG
OVERLAP_SENTENCES = 1      # not used below
HARD_SENT_WORD_SPLIT = 80  # word window for splitting overly long sentences
HUGE_CODE_LINES = 25       # line window for splitting overly long code blocks
SYMBOL_DENSITY = 0.10      # symbol-to-character ratio above which a line counts as code

# ---------- HELPERS ----------
def num_tokens(text: str) -> int:
    """Count tokens with tiktoken when available, else fall back to word count."""
    if _ENC:
        try:
            return len(_ENC.encode(text))
        except Exception:
            pass
    return max(1, len(text.split()))

def safe_sentence_tokenize(text: str) -> List[str]:
    """Sentence-split with NLTK when available, else a regex/word-window fallback."""
    if _NLTK_OK and nltk:
        try:
            return nltk.sent_tokenize(text)
        except Exception:
            pass
    parts = re.split(r'(?<=[.!?])\s+(?=[A-Z(0-9`])', text.strip())
    if len(parts) == 1:
        words = text.split()
        return [" ".join(words[i:i+HARD_SENT_WORD_SPLIT]) for i in range(0, len(words), HARD_SENT_WORD_SPLIT)] or [text]
    return parts

def is_symbol_dense(line: str) -> bool:
    symbols = re.findall(r"[()\[\]{};:+\-*/=<>|&^%$~,.`]", line)
    return (len(symbols) / max(1, len(line))) >= SYMBOL_DENSITY

def looks_like_code_line(line: str) -> bool:
    s = line.strip()
    if not s: return False
    if s == "```": return True
    if line.startswith((" ", "\t")): return True  # indented lines are treated as code
    if re.match(r"^\s*(>>>|#|//|/\*|\*|\w+\s*=\s*)", s): return True
    if re.match(r"^\s*\(.*\)\s*$", s): return True  # Lisp
    if re.match(r"^\s*(function\s+\w+\s*\(|\w+\s*\(.*\)\s*;?\s*|\{\s*|\}\s*)$", s): return True
    if re.match(r"^\s*[\d.]+\s*[\+\-\*/]\s*[\d.]+;?\s*$", s): return True
    if is_symbol_dense(s): return True
    return False

def detect_code_blocks(text: str) -> List[Tuple[str, str]]:
    segs, cur, is_code, fenced = [], [], False, False
    lines = text.splitlines()
    for i, raw in enumerate(lines):
        line = raw.rstrip("\n")
        if line.strip() == "```":
            if cur:
                segs.append(("code" if is_code or fenced else "text", "\n".join(cur)))
                cur = []
            fenced = not fenced
            is_code = fenced or is_code
            continue

        # decide next_line for boundary detection
        next_line = lines[i+1].strip() if i+1 < len(lines) else ""

        if fenced or looks_like_code_line(line):
            if not is_code and cur:
                segs.append(("text", "\n".join(cur)))
                cur = []
            is_code = True
        else:
            # boundary: code followed by capitalized prose
            if is_code and next_line and re.match(r"^[A-Z]", next_line):
                segs.append(("code", "\n".join(cur + [line])))
                cur, is_code = [], False
                continue
            if is_code and cur:
                segs.append(("code", "\n".join(cur)))
                cur = []
                is_code = False
        cur.append(line)
    if cur:
        segs.append(("code" if is_code or fenced else "text", "\n".join(cur)))
    return segs

def split_code_by_lines(block: str, window: int) -> List[str]:
    lines, out, cur = block.split("\n"), [], []
    for ln in lines:
        cur.append(ln)
        if len(cur) >= window or num_tokens("\n".join(cur)) >= MAX_TOKENS:
            out.append("\n".join(cur))
            cur = []
    if cur: out.append("\n".join(cur))
    return out

def split_huge_code_block(block: str) -> List[str]:
    # blocks that mix JavaScript and Scheme get a tighter window
    if "function" in block and "(define" in block:
        return split_code_by_lines(block, 15)
    return split_code_by_lines(block, HUGE_CODE_LINES)

def clean_text_noise(txt: str) -> str:
    txt = re.sub(r'\b\w*_example(_\d+)?\b', '', txt)
    lines = [l for l in txt.splitlines() if l.strip()]
    uniq = []
    for l in lines:
        if not uniq or l.strip() != uniq[-1].strip():
            uniq.append(l)
    return "\n".join(uniq).strip()

def is_duplicate_code(a: str, b: str) -> bool:
    return SequenceMatcher(None, a.strip(), b.strip()).ratio() > 0.8

def chunk_by_tokens(text: str) -> List[str]:
    segs = detect_code_blocks(clean_text_noise(text))
    chunks, buf, tok = [], [], 0

    def flush():
        nonlocal buf, tok
        if buf:
            chunks.append(" ".join(buf).strip())
        buf = []
        tok = 0

    for t, seg in segs:
        if not seg.strip(): continue
        if t == "code":
            seg = seg.strip()
            seg_toks = num_tokens(seg)
            code_parts = [seg] if seg_toks <= MAX_TOKENS else split_huge_code_block(seg)
            # dedupe similar adjacent code blocks (Lisp vs JS)
            if buf and any(is_duplicate_code(seg, b) for b in buf if b.startswith("(") or b.startswith("function")):
                continue
            for part in code_parts:
                part_toks = num_tokens(part)
                if tok + part_toks > MAX_TOKENS: flush()
                buf.append(part)
                tok += part_toks
                if tok >= MAX_TOKENS: flush()
            continue

        for s in safe_sentence_tokenize(seg):
            s = s.strip()
            if not s: continue
            stoks = num_tokens(s)
            if stoks > MAX_TOKENS:
                words = s.split()
                for i in range(0, len(words), HARD_SENT_WORD_SPLIT):
                    sub = " ".join(words[i:i+HARD_SENT_WORD_SPLIT])
                    if tok + num_tokens(sub) > MAX_TOKENS: flush()
                    buf.append(sub)
                    tok += num_tokens(sub)
                    if tok >= MAX_TOKENS: flush()
                continue
            if tok + stoks > MAX_TOKENS: flush()
            buf.append(s)
            tok += stoks
    if buf: flush()
    return [c for c in chunks if c.strip()]

def make_chunk_key(e):
    return (e.get("title") or "", e.get("parent_title") or "", e.get("source_file") or "")

# ---------- MAIN ----------
def main():
    all_chunks = []
    files = sorted(glob(os.path.join(CHAPTER_JSON_DIR, CHAPTER_FILE_GLOB)))
    print(f"📚 Processing {len(files)} chapter files...")

    for fpath in files:
        with open(fpath, "r", encoding="utf-8") as f:
            data = json.load(f)

        data = [d for d in data if isinstance(d, dict) and d.get("content")]
        data.sort(key=lambda x: (x.get("parent_title") or "", x.get("title") or "", x.get("paragraph_index", 0)))

        for (title, parent, src), grp in groupby(data, key=make_chunk_key):
            paras = []
            seen = set()
            for p in grp:
                txt = clean_text_noise(p["content"])
                if txt and txt not in seen:
                    seen.add(txt)
                    paras.append(txt)
            merged = "\n".join(paras)
            if not merged.strip(): continue

            subs = chunk_by_tokens(merged)
            base = os.path.splitext(os.path.basename(fpath))[0]
            safe_title = (title or "section").replace(" ", "_")[:60]
            for i, ch in enumerate(subs, 1):
                all_chunks.append({
                    "chapter_file": os.path.basename(fpath),
                    "section": parent or None,
                    "subsection": title or None,
                    "chunk_id": f"{base}_{safe_title}_{i}",
                    "chunk_index": i,
                    "content": ch,
                    "token_count": num_tokens(ch),
                    "source_files": [src]
                })

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        json.dump(all_chunks, out, indent=2, ensure_ascii=False)

    if all_chunks:
        avg = sum(c["token_count"] for c in all_chunks) / len(all_chunks)
        print(f"✅ Created {len(all_chunks)} chunks → {OUTPUT_FILE}")
        print(f"📊 Avg tokens: {avg:.1f}, Max: {max(c['token_count'] for c in all_chunks)}")
    else:
        print("⚠️ No chunks produced.")

if __name__ == "__main__":
    main()
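
As a rough usage sketch (not part of this commit), the meso-chunks that main() writes can be loaded and sanity-checked as follows; the field names are the ones assembled in main() above.

import json

# Load the RAG chunks produced by chunking.py (run from inside parser/).
with open("sicp_mesochunks_semantic_rag.json", encoding="utf-8") as f:
    chunks = json.load(f)

# Every chunk carries section metadata plus a token count for retrieval budgeting.
print(f"{len(chunks)} chunks, max {max(c['token_count'] for c in chunks)} tokens")
for c in chunks[:3]:
    print(c["chunk_id"], "|", c["section"], ">", c["subsection"], "|", c["token_count"])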

parser/parse_sicp.py

Lines changed: 116 additions & 0 deletions
import os
import re
import xml.etree.ElementTree as ET
import html
import json

# Path to chapter folders
SICP_XML_DIR = os.path.join(os.path.dirname(__file__), "..", "xml")

def parse_file(file_path, parent_title=None, depth=0):
    """
    Recursively parse any XML file (chapter, section, or subsection).
    """
    indent = " " * depth  # for nice indentation in logs

    if not os.path.exists(file_path):
        print(f"{indent}⚠️ Missing file: {file_path}")
        return []

    print(f"{indent}📄 Parsing ({depth=}): {file_path}")

    # Parse and unescape
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
    except Exception as e:
        print(f"{indent}❌ XML parse error in {file_path}: {e}")
        return []

    xml_text = html.unescape(ET.tostring(root, encoding="unicode"))
    chunks = []

    # Identify tag type
    tag_type = root.tag.upper()
    if root.find("NAME") is not None:
        title = " ".join(root.find("NAME").itertext())
        title = re.sub(r"\s+", " ", title).strip()
    else:
        title = "Untitled"

    # Extract text paragraphs
    text_blocks = root.findall(".//TEXT")
    print(f"{indent}🧩 Found {len(text_blocks)} <TEXT> blocks in {os.path.basename(file_path)}")

    for i, t in enumerate(text_blocks, start=1):
        for bad_tag in ["INDEX", "LABEL", "CITATION", "FOOTNOTE", "COMMENT", "WEB_ONLY"]:
            for el in t.findall(f".//{bad_tag}"):
                el.clear()

        text_content = " ".join(t.itertext()).strip()
        text_content = re.sub(r"\s+", " ", text_content)

        if text_content:
            chunks.append({
                "source_file": os.path.basename(file_path),
                "tag_type": tag_type,
                "title": title,
                "parent_title": parent_title,
                "depth": depth,
                "paragraph_index": i,
                "content": text_content
            })

    # Look for section and subsection references
    section_refs = re.findall(r"&section([\d\.]+);", xml_text)
    subsection_refs = re.findall(r"&subsection([\d\.]+);", xml_text)

    if section_refs:
        print(f"{indent}🔍 Found {len(section_refs)} section ref(s): {section_refs}")
    if subsection_refs:
        print(f"{indent} ↳ Found {len(subsection_refs)} subsection ref(s): {subsection_refs}")

    # Recurse into sections
    for ref in section_refs:
        section_folder = os.path.join(os.path.dirname(file_path), f"section{ref.split('.')[0]}")
        section_file = os.path.join(section_folder, f"section{ref.split('.')[0]}.xml")
        print(f"{indent}➡️ Going into section file: {section_file}")
        chunks.extend(parse_file(section_file, parent_title=title, depth=depth + 1))

    # Recurse into subsections
    for ref in subsection_refs:
        subsection_file = os.path.join(os.path.dirname(file_path), f"subsection{ref.split('.')[0]}.xml")
        print(f"{indent}➡️ Going into subsection file: {subsection_file}")
        chunks.extend(parse_file(subsection_file, parent_title=title, depth=depth + 1))

    print(f"{indent}✅ Done parsing {os.path.basename(file_path)}, total chunks so far: {len(chunks)}\n")
    return chunks

if __name__ == "__main__":
    print("🚀 Starting full SICP parse\n")

    # ✅ Automatically detect all chapter folders (chapter1, chapter2, ...)
    for chapter_dir in sorted(os.listdir(SICP_XML_DIR)):
        if not chapter_dir.startswith("chapter"):
            continue

        chapter_path = os.path.join(SICP_XML_DIR, chapter_dir, f"{chapter_dir}.xml")
        if not os.path.exists(chapter_path):
            print(f"⚠️ Skipping {chapter_dir}: main XML not found\n")
            continue

        print("\n==============================")
        print(f"📘 Parsing {chapter_dir}")
        print("==============================")

        all_chunks = parse_file(chapter_path)
        print(f"✅ Extracted {len(all_chunks)} chunks for {chapter_dir}\n")

        # Save separate JSON for each chapter
        out_path = os.path.join(os.path.dirname(__file__), f"{chapter_dir}_chunks.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(all_chunks, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved {chapter_dir}_chunks.json ({len(all_chunks)} chunks)\n")

    print("🏁 All chapters processed successfully!")
