git clone https://github.com/bcfeen/DocMine.git
cd DocMine
python -m venv .venv
source .venv/bin/activate # Windows: .venv\Scripts\activate
pip install -r requirements.txt
pip install -e .

python validate_kos.py

Expected output: ✅ ALL TESTS PASSED

from docmine.kos_pipeline import KOSPipeline
# Initialize
pipeline = KOSPipeline(namespace="demo")
# Ingest (supports PDF, MD, TXT)
pipeline.ingest_file("document.pdf")
# Semantic search
results = pipeline.search("your query", top_k=5)
for r in results:
    print(f"{r['text']} (score: {r['score']:.2f})")
# Exact recall (find ALL mentions)
segments = pipeline.search_entity("CCNA001")
print(f"Found {len(segments)} segments (complete)")
# List entities
entities = pipeline.list_entities()
for e in entities[:5]:
    print(f"{e['name']}: {e['mention_count']} mentions")

| Task | Old Way | New Way (KOS) |
|---|---|---|
| Import | from docmine.pipeline import PDFPipeline | from docmine.kos_pipeline import KOSPipeline |
| Initialize | PDFPipeline() | KOSPipeline(namespace="project") |
| Ingest | ingest_file("doc.pdf") | Same, but idempotent! |
| Search | search("query") | Same, returns provenance |
| Exact recall | ❌ Not available | search_entity("CCNA001") ✅ |
| Entities | ❌ Not available | list_entities() ✅ |

pipeline.ingest_directory("./papers", pattern="*.pdf")

# Only re-processes files with different content_hash
pipeline.reingest_changed(namespace="project")

# Find ALL mentions (semantic search can miss some)
entity = pipeline.get_entity("BRCA1", entity_type="gene")
if entity:
    segments = pipeline.get_segments_for_entity(entity.id)
    for seg in segments:
        print(f"Source: {seg['source_uri']}")
        print(f"Page: {seg['provenance']['page']}")
        print(f"Text: {seg['text']}\n")

# Semantic (fuzzy, incomplete)
semantic = pipeline.search("CCNA001", top_k=10)
# Exact (complete, guaranteed)
exact = pipeline.search_entity("CCNA001")
print(f"Semantic found: {len(semantic)}")
print(f"Exact found: {len(exact)}")  # Usually >= semantic

# Separate namespaces
pipeline.ingest_file("alpha.pdf", namespace="lab_alpha")
pipeline.ingest_file("beta.pdf", namespace="lab_beta")
# Search within namespace
results_alpha = pipeline.search("query", namespace="lab_alpha")
results_beta = pipeline.search("query", namespace="lab_beta")

from docmine.extraction import RegexEntityExtractor
extractor = RegexEntityExtractor()
extractor.add_pattern("custom", r"\bCUST-\d{4}\b")
pipeline = KOSPipeline(entity_extractor=extractor)

stats = pipeline.stats(namespace="project")
print(stats)
# {
# "namespace": "project",
# "information_resources": 10,
# "segments": 1420,
# "entities": 45,
# "entity_types": 5
# }

python validate_kos.py

pip install pytest
pytest tests/ -v

# Idempotency tests
pytest tests/test_idempotency.py -v
# Exact recall tests
pytest tests/test_exact_recall.py -v

If you have existing chunk data:
python scripts/migrate_legacy_chunks.py \
--old-db knowledge.duckdb \
--new-db knowledge_kos.duckdb \
--namespace legacy

Check namespace isolation:
# Wrong: mixing namespaces
pipeline.ingest_file("doc.pdf", namespace="ns1")
pipeline.count_segments(namespace="ns2") # Returns 0!
# Right: use same namespace
pipeline.ingest_file("doc.pdf", namespace="ns1")
pipeline.count_segments(namespace="ns1")  # Correct

Check your entity extractor patterns:
entities = pipeline.list_entities()
if not entities:
    # Try with default patterns
    from docmine.extraction import RegexEntityExtractor
    extractor = RegexEntityExtractor()
    print(extractor.list_patterns())  # See what patterns are active

Ensure embeddings were generated:
# Embeddings are created automatically during ingest
# But you can verify:
pipeline.ingest_file("doc.pdf")
results = pipeline.search("test query")
if not results:
    # Check segment count
    count = pipeline.count_segments()
    print(f"Segments: {count}")

python examples/kos_demo.py

- Quick: README_KOS.md
- Deep: docs/knowledge_centric_migration.md

# Old system: creates duplicates
old_pipeline.ingest_file("doc.pdf") # 100 chunks
old_pipeline.ingest_file("doc.pdf") # 200 chunks (duplicates!)
# New system: idempotent
pipeline.ingest_file("doc.pdf") # 100 segments
pipeline.ingest_file("doc.pdf") # Still 100 segments ✓

# Old system: auto-increment IDs (not stable)
# chunk_id = 1, 2, 3... (changes on re-ingest)
# New system: deterministic hashes
# segment_id = sha256(namespace + uri + provenance + text)
# Same document → same IDs, always

# Old system: basic page info
{"page_num": 5, "chunk_index": 3}
# New system: full provenance
{
    "page": 5,
    "sentence": 3,
    "sentence_count": 3,
    "source_uri": "file:///path/doc.pdf"
}

- Read the full README: README_KOS.md
- Understand the architecture: docs/knowledge_centric_migration.md
- Run examples: python examples/kos_demo.py
- Write your own code: Use the API examples above

- Documentation: README_KOS.md, docs/
- Tests: tests/
- Examples: examples/
- Issues: https://github.com/bcfeen/DocMine/issues

DocMine KOS: Production-ready knowledge extraction
✅ Idempotent • ✅ Stable IDs • ✅ Entity Tracking • ✅ Exact Recall • ✅ Multi-Corpus