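"""Re-chunk per-chapter `chapter*_chunks.json` files into ~300-token "mesochunks"
for RAG retrieval: paragraphs are grouped by section/subsection, cleaned of noise,
and split at sentence boundaries (prose) or line boundaries (code).
The result is written to `sicp_mesochunks_semantic_rag.json`.
"""
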
import os
import re
import json
from glob import glob
from itertools import groupby
from difflib import SequenceMatcher
from typing import List, Tuple

try:
    import nltk
    nltk.download("punkt", quiet=True)
    try:
        # Newer NLTK releases also need the punkt_tab resource; ignore if unavailable.
        nltk.download("punkt_tab", quiet=True)
    except Exception:
        pass
    _NLTK_OK = True
except Exception:
    nltk = None
    _NLTK_OK = False

try:
    import tiktoken
    _ENC = tiktoken.get_encoding("cl100k_base")
except Exception:
    _ENC = None

# ---------- CONFIG ----------
CHAPTER_JSON_DIR = "."
CHAPTER_FILE_GLOB = "chapter*_chunks.json"
OUTPUT_FILE = "sicp_mesochunks_semantic_rag.json"

MAX_TOKENS = 300             # token budget per chunk; smaller chunks for RAG
OVERLAP_SENTENCES = 1        # reserved for sentence overlap between chunks (not applied below)
HARD_SENT_WORD_SPLIT = 80    # word window for splitting over-long sentences
HUGE_CODE_LINES = 25         # line window for splitting oversized code blocks
SYMBOL_DENSITY = 0.10        # symbol-to-character ratio above which a line counts as code

# ---------- HELPERS ----------
def num_tokens(text: str) -> int:
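    """Count tokens with tiktoken's cl100k_base encoding, falling back to a
    whitespace word count when tiktoken is unavailable or encoding fails."""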
    if _ENC:
        try:
            return len(_ENC.encode(text))
        except Exception:
            pass
    return max(1, len(text.split()))

def safe_sentence_tokenize(text: str) -> List[str]:
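    """Split text into sentences with NLTK when available; otherwise fall back to a
    punctuation-based regex, and finally to fixed windows of HARD_SENT_WORD_SPLIT words."""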
    if _NLTK_OK and nltk:
        try:
            return nltk.sent_tokenize(text)
        except Exception:
            pass
    parts = re.split(r'(?<=[.!?])\s+(?=[A-Z(0-9`])', text.strip())
    if len(parts) == 1:
        words = text.split()
        return [" ".join(words[i:i + HARD_SENT_WORD_SPLIT])
                for i in range(0, len(words), HARD_SENT_WORD_SPLIT)] or [text]
    return parts

def is_symbol_dense(line: str) -> bool:
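    """Return True when the ratio of punctuation/operator characters to line length
    reaches SYMBOL_DENSITY, a rough signal that the line is code."""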
    symbols = re.findall(r"[()\[\]{};:+\-*/=<>|&^%$~,.`]", line)
    return (len(symbols) / max(1, len(line))) >= SYMBOL_DENSITY

def looks_like_code_line(line: str) -> bool:
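    """Heuristically decide whether a single line is code: indentation, fences,
    REPL prompts, comments, assignments, Lisp forms, JS-style definitions or calls,
    bare arithmetic, or high symbol density."""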
    s = line.strip()
    if not s: return False
    if s == "```": return True
    if line.startswith((" ", "\t")): return True  # indented lines read as code (test the raw line; s is already stripped)
    if re.match(r"^\s*(>>>|#|//|/\*|\*|\w+\s*=\s*)", s): return True
    if re.match(r"^\s*\(.*\)\s*$", s): return True  # Lisp / Scheme form
    if re.match(r"^\s*(function\s+\w+\s*\(|\w+\s*\(.*\)\s*;?\s*|\{\s*|\}\s*)$", s): return True  # JS
    if re.match(r"^\s*[\d.]+\s*[\+\-\*/]\s*[\d.]+;?\s*$", s): return True  # bare arithmetic
    if is_symbol_dense(s): return True
    return False

def detect_code_blocks(text: str) -> List[Tuple[str, str]]:
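    """Split text into an ordered list of ("text" | "code", segment) pairs, using
    ``` fences and per-line heuristics to find code/prose boundaries."""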
    segs, cur, is_code, fenced = [], [], False, False
    lines = text.splitlines()
    for i, raw in enumerate(lines):
        line = raw.rstrip("\n")
        if line.strip() == "```":
            if cur:
                segs.append(("code" if is_code or fenced else "text", "\n".join(cur)))
                cur = []
            fenced = not fenced
            is_code = fenced  # entering a fence starts code; leaving it resets the flag
            continue

        # look one line ahead for the code/prose boundary heuristic
        next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""

        if fenced or looks_like_code_line(line):
            if not is_code and cur:
                segs.append(("text", "\n".join(cur)))
                cur = []
            is_code = True
        else:
            # boundary: a code run ends when the following line is capitalized prose;
            # the current line is kept with the code (e.g. printed output)
            if is_code and next_line and re.match(r"^[A-Z]", next_line):
                segs.append(("code", "\n".join(cur + [line])))
                cur, is_code = [], False
                continue
            if is_code and cur:
                segs.append(("code", "\n".join(cur)))
                cur = []
            is_code = False
        cur.append(line)
    if cur:
        segs.append(("code" if is_code or fenced else "text", "\n".join(cur)))
    return segs

def split_code_by_lines(block: str, window: int) -> List[str]:
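    """Break a code block into pieces of at most `window` lines, flushing early
    when a piece reaches MAX_TOKENS."""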
    lines, out, cur = block.split("\n"), [], []
    for ln in lines:
        cur.append(ln)
        if len(cur) >= window or num_tokens("\n".join(cur)) >= MAX_TOKENS:
            out.append("\n".join(cur))
            cur = []
    if cur: out.append("\n".join(cur))
    return out

def split_huge_code_block(block: str) -> List[str]:
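    """Split an oversized code block; use a tighter window when it appears to mix
    JavaScript and Scheme (both `function` and `(define` present)."""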
    if "function" in block and "(define" in block:
        return split_code_by_lines(block, 15)
    return split_code_by_lines(block, HUGE_CODE_LINES)

def clean_text_noise(txt: str) -> str:
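    """Strip `*_example[_N]` label noise, drop blank lines, and collapse
    consecutive duplicate lines."""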
    txt = re.sub(r'\b\w*_example(_\d+)?\b', '', txt)
    lines = [l for l in txt.splitlines() if l.strip()]
    uniq = []
    for l in lines:
        if not uniq or l.strip() != uniq[-1].strip():
            uniq.append(l)
    return "\n".join(uniq).strip()

def is_duplicate_code(a: str, b: str) -> bool:
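    """Treat two code snippets as duplicates when their similarity ratio exceeds 0.8
    (used to drop near-identical Lisp/JS variants)."""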
    return SequenceMatcher(None, a.strip(), b.strip()).ratio() > 0.8

def chunk_by_tokens(text: str) -> List[str]:
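    """Convert cleaned text into chunks of at most MAX_TOKENS tokens, packing whole
    sentences and code pieces into a buffer and flushing when the budget is reached."""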
    segs = detect_code_blocks(clean_text_noise(text))
    chunks, buf, tok = [], [], 0

    def flush():
        nonlocal buf, tok
        if buf:
            chunks.append(" ".join(buf).strip())
        buf = []
        tok = 0

    for t, seg in segs:
        if not seg.strip(): continue
        if t == "code":
            seg = seg.strip()
            seg_toks = num_tokens(seg)
            code_parts = [seg] if seg_toks <= MAX_TOKENS else split_huge_code_block(seg)
            # dedupe similar adjacent code blocks (Lisp vs JS)
            if buf and any(is_duplicate_code(seg, b) for b in buf if b.startswith("(") or b.startswith("function")):
                continue
            for part in code_parts:
                part_toks = num_tokens(part)
                if tok + part_toks > MAX_TOKENS: flush()
                buf.append(part)
                tok += part_toks
                if tok >= MAX_TOKENS: flush()
            continue

        for s in safe_sentence_tokenize(seg):
            s = s.strip()
            if not s: continue
            stoks = num_tokens(s)
            if stoks > MAX_TOKENS:
                words = s.split()
                for i in range(0, len(words), HARD_SENT_WORD_SPLIT):
                    sub = " ".join(words[i:i + HARD_SENT_WORD_SPLIT])
                    if tok + num_tokens(sub) > MAX_TOKENS: flush()
                    buf.append(sub)
                    tok += num_tokens(sub)
                    if tok >= MAX_TOKENS: flush()
                continue
            if tok + stoks > MAX_TOKENS: flush()
            buf.append(s)
            tok += stoks
    if buf: flush()
    return [c for c in chunks if c.strip()]

def make_chunk_key(e):
    # groupby() only merges consecutive items, so the same key is used for both
    # sorting and grouping in main().
    return (e.get("parent_title") or "", e.get("title") or "", e.get("source_file") or "")

# ---------- MAIN ----------
def main():
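    """Read every chapter chunk file, merge its paragraphs per section/subsection,
    re-chunk the merged text, and write all chunks with metadata to OUTPUT_FILE."""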
    all_chunks = []
    files = sorted(glob(os.path.join(CHAPTER_JSON_DIR, CHAPTER_FILE_GLOB)))
    print(f"📚 Processing {len(files)} chapter files...")

    for fpath in files:
        with open(fpath, "r", encoding="utf-8") as f:
            data = json.load(f)

        data = [d for d in data if isinstance(d, dict) and d.get("content")]
        # Sort by the grouping key so groupby sees each group as one contiguous run.
        data.sort(key=lambda x: (make_chunk_key(x), x.get("paragraph_index", 0)))

        for (parent, title, src), grp in groupby(data, key=make_chunk_key):
            paras = []
            seen = set()
            for p in grp:
                txt = clean_text_noise(p["content"])
                if txt and txt not in seen:
                    seen.add(txt)
                    paras.append(txt)
            merged = "\n".join(paras)
            if not merged.strip(): continue

            subs = chunk_by_tokens(merged)
            base = os.path.splitext(os.path.basename(fpath))[0]
            safe_title = (title or "section").replace(" ", "_")[:60]
            for i, ch in enumerate(subs, 1):
                all_chunks.append({
                    "chapter_file": os.path.basename(fpath),
                    "section": parent or None,
                    "subsection": title or None,
                    "chunk_id": f"{base}_{safe_title}_{i}",
                    "chunk_index": i,
                    "content": ch,
                    "token_count": num_tokens(ch),
                    "source_files": [src]
                })

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        json.dump(all_chunks, out, indent=2, ensure_ascii=False)

    if all_chunks:
        avg = sum(c["token_count"] for c in all_chunks) / len(all_chunks)
        print(f"✅ Created {len(all_chunks)} chunks → {OUTPUT_FILE}")
        print(f"📊 Avg tokens: {avg:.1f}, Max: {max(c['token_count'] for c in all_chunks)}")
    else:
        print("⚠️ No chunks produced.")

if __name__ == "__main__":
    main()