standardbeagle · andylbrummer · Jun 19, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/.gitignore b/.gitignore
@@ -13,10 +13,17 @@ Testing/
 # dartai loop state (machine-local)
 .dartai/
 
-# Real projects test data (git submodules)
+# Real projects test data (fetched via scripts/add-real-projects.sh)
 real_projects/go/
 real_projects/python/
 real_projects/typescript/
+real_projects/java/
+real_projects/csharp/
+real_projects/rust/
+real_projects/php/
+real_projects/kotlin/
+real_projects/ruby/
+real_projects/zig/
 
 # worktrack loop state (per-clone, may contain private email in lease_holder)
 .worktrack/
diff --git a/include/lci/config.h b/include/lci/config.h
@@ -17,6 +17,11 @@ struct ProjectConfig {
 
 struct IndexConfig {
     int64_t max_file_size = 10 * 1024 * 1024;      // 10 MB
+    // Files larger than this are still trigram-indexed for text search but skip
+    // the tree-sitter parse + symbol extraction: a multi-MB source file is
+    // almost always generated/minified, where the parse cost (parse is ~58% of
+    // index CPU) buys little symbol value. A value <= 0 disables the cap.
+    int64_t max_parse_file_size = 2 * 1024 * 1024;  // 2 MB
     int64_t max_total_size_mb = 500;
     int max_file_count = 10000;
     bool follow_symlinks = false;

diff --git a/include/lci/indexing/pipeline_types.h b/include/lci/indexing/pipeline_types.h
@@ -63,6 +63,7 @@ struct ProcessedFile {
     std::chrono::nanoseconds duration{};
     Error error{};
     bool has_error{};
+    bool parse_skipped_oversize{};  // trigram-indexed, tree-sitter skipped
 };
 
 /// Pipeline buffer size constants.

diff --git a/real_projects/README.md b/real_projects/README.md
@@ -28,8 +28,8 @@ git submodule update --recursive --remote
 Or use the provided setup script:
 
 ```bash
-./scripts/add-real-projects.sh --minimal   # 2 projects (fast)
-./scripts/add-real-projects.sh --full      # All 9 projects
+./scripts/add-real-projects.sh --minimal   # 3 projects (fast: Go/Python/TS)
+./scripts/add-real-projects.sh --full      # All 16 projects (covers 13 languages)
 ```
 
 ## Directory Structure
@@ -48,6 +48,13 @@ real_projects/
 │   ├── next.js/          # Next.js React framework
 │   ├── shadcn-ui/        # UI component library
 │   └── trpc/             # TypeScript RPC framework
+├── java/gson/             # Call-graph corpora: one repo for each language that
+├── csharp/serilog/        # gained a call graph from scope-based receiver-type
+├── rust/ripgrep/          # resolution. real_project_languages_test.cpp asserts
+├── php/guzzle/            # that resolution fires on each of these repos.
+├── kotlin/okhttp/
+├── ruby/sinatra/
+├── zig/zls/
 └── README.md             # This file
 ```
 

diff --git a/scripts/add-real-projects.sh b/scripts/add-real-projects.sh
@@ -31,6 +31,16 @@ PROJECTS=(
     "typescript:next.js:https://github.com/vercel/next.js.git"
     "typescript:shadcn-ui:https://github.com/shadcn-ui/ui.git"
     "typescript:trpc:https://github.com/trpc/trpc.git"
+    # Call-graph corpora for the scope-type-resolution languages. Each had no
+    # call graph before that work; real_project_languages_test.cpp asserts
+    # receiver-type resolution fires on these real repos.
+    "java:gson:https://github.com/google/gson.git"
+    "csharp:serilog:https://github.com/serilog/serilog.git"
+    "rust:ripgrep:https://github.com/BurntSushi/ripgrep.git"
+    "php:guzzle:https://github.com/guzzle/guzzle.git"
+    "kotlin:okhttp:https://github.com/square/okhttp.git"
+    "ruby:sinatra:https://github.com/sinatra/sinatra.git"
+    "zig:zls:https://github.com/zigtools/zls.git"
 )
 
 # Minimal projects for initial setup. trpc keeps the TS surface
@@ -90,7 +100,7 @@ fi
 
 # Create real_projects directories
 log "Creating directory structure..."
-mkdir -p real_projects/{go,python,typescript}
+mkdir -p real_projects/{go,python,typescript,java,csharp,rust,php,kotlin,ruby,zig}
 
 ADDED=0
 FAILED=0

diff --git a/src/config/config.cpp b/src/config/config.cpp
@@ -353,6 +353,14 @@ void apply_index(Config& cfg, const KdlNode& node) {
                 int v = 0;
                 if (get_int(child, v)) cfg.index.max_file_size = v;
             }
+        } else if (child.name == "max_parse_file_size") {
+            std::string sz;
+            if (get_string(child, sz)) {
+                cfg.index.max_parse_file_size = parse_size_string(sz);
+            } else {
+                int v = 0;
+                if (get_int(child, v)) cfg.index.max_parse_file_size = v;
+            }
         } else if (child.name == "max_total_size_mb") {
             int v = 0;
             if (get_int(child, v)) cfg.index.max_total_size_mb = v;

diff --git a/src/indexing/pipeline_processor.cpp b/src/indexing/pipeline_processor.cpp
@@ -23,7 +23,8 @@ namespace {
 /// that trigram/postings indexing still proceeds.
 void run_unified_extraction(ProcessedFile& result,
                             std::string_view content,
-                            const std::string& path) {
+                            const std::string& path,
+                            int64_t max_parse_bytes) {
     auto ext = std::filesystem::path(path).extension().string();
     if (ext.empty()) return;
 
@@ -32,6 +33,15 @@ void run_unified_extraction(ProcessedFile& result,
         return;  // Unsupported language: trigrams still index for text search.
     }
 
+    // Oversized source: skip the tree-sitter parse (the expensive stage) but
+    // keep trigram text indexing. Surface the skip rather than silently
+    // dropping symbols for the file.
+    if (max_parse_bytes > 0 &&
+        static_cast<int64_t>(content.size()) > max_parse_bytes) {
+        result.parse_skipped_oversize = true;
+        return;
+    }
+
     parser::PooledParser parser_guard(lang);
     if (!parser_guard) return;
 
@@ -182,7 +192,8 @@ ProcessedFile FileProcessor::process_file(int /*worker_id*/,
     // tree-sitter. This populates the symbol-aware data the integrator
     // feeds into ReferenceTracker. Without this step, browse-file,
     // list-symbols, references, and tree endpoints all return empty.
-    run_unified_extraction(result, content, task.path);
+    run_unified_extraction(result, content, task.path,
+                           config_.index.max_parse_file_size);
 
     // Bucket trigrams during processing (zero-lock per-file)
     if (trigram_index_ != nullptr && content.size() >= 3) {

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -187,6 +187,7 @@ add_executable(lci_real_project_tests
     integration/real_project_side_effects_test.cpp
     integration/real_project_typescript_test.cpp
     integration/real_project_feature_audit_test.cpp
+    integration/real_project_languages_test.cpp
     integration/spec_runner.cpp
 )
 target_include_directories(lci_real_project_tests PRIVATE

diff --git a/tests/integration/real_project_languages_test.cpp b/tests/integration/real_project_languages_test.cpp
@@ -0,0 +1,91 @@
+// Real-project call-graph resolution across all newly-wired languages.
+//
+// Each of these seven languages had NO call graph before scope-based
+// receiver-type resolution was added. These tests index a real upstream repo
+// per language and assert that (a) symbols are extracted and (b) a known
+// method call resolves to a receiver-type-qualified target (`Type.method`),
+// i.e. the SCIP-base-case resolution fires on real code, not just the
+// controlled corpora in the unit suite.
+//
+// Skips gracefully when the corpus is absent (run ./scripts/add-real-projects.sh).
+
+#include <gtest/gtest.h>
+
+#include <lci/config.h>
+#include <lci/core/reference_tracker.h>
+#include <lci/indexing/master_index.h>
+
+#include <string>
+#include <vector>
+
+#include "helpers/real_project_helpers.h"
+
+namespace lci {
+namespace {
+
+// Shared body of every test in this file. Looks up the <lang>/<repo> corpus
+// (skipping when it is absent), indexes it, and then requires that at least
+// one symbol named `sentinel` lists `qualified_callee` among its callee names.
+void expect_qualified_callee(const std::string& lang, const std::string& repo,
+                             const std::string& sentinel,
+                             const std::string& qualified_callee) {
+    auto corpus_dir = testing::find_real_project(lang, repo);
+    if (!corpus_dir) {
+        GTEST_SKIP() << "Real project not found: " << lang << "/" << repo
+                     << ". Run ./scripts/add-real-projects.sh";
+    }
+    auto ctx = testing::setup_real_project(*corpus_dir, repo);
+    ASSERT_TRUE(ctx.valid()) << "Failed to index " << lang << "/" << repo;
+    EXPECT_GT(ctx.indexer->file_count(), 0);
+
+    const auto& tracker = ctx.indexer->ref_tracker();
+    auto syms = tracker.find_symbols_by_name(sentinel);
+    ASSERT_FALSE(syms.empty())
+        << "sentinel symbol '" << sentinel << "' not extracted in " << repo;
+
+    // The name lookup can return several symbols; one matching edge is enough.
+    const bool found = [&] {
+        for (const auto* sym : syms) {
+            for (const auto& name : tracker.get_callee_names(sym->id)) {
+                if (name == qualified_callee) return true;
+            }
+        }
+        return false;
+    }();
+    EXPECT_TRUE(found) << lang << "/" << repo << ": " << sentinel
+                       << " should have a receiver-type-qualified callee '"
+                       << qualified_callee << "'";
+}
+
+// One test per language; the arguments are (lang, repo, sentinel, callee).
+TEST(RealProjectLanguages, JavaGsonResolvesReceiverType) {
+    expect_qualified_callee("java", "gson", "toJson", "Gson.toJson");
+}
+
+TEST(RealProjectLanguages, CSharpSerilogResolvesReceiverType) {
+    expect_qualified_callee("csharp", "serilog", "Write", "Logger.IsEnabled");
+}
+
+TEST(RealProjectLanguages, RustRipgrepResolvesReceiverType) {
+    expect_qualified_callee("rust", "ripgrep", "build", "GlobSetBuilder.add");
+}
+
+TEST(RealProjectLanguages, PhpGuzzleResolvesReceiverType) {
+    expect_qualified_callee("php", "guzzle", "send", "Client.sendAsync");
+}
+
+TEST(RealProjectLanguages, KotlinOkhttpResolvesReceiverType) {
+    expect_qualified_callee("kotlin", "okhttp", "intercept", "Chain.request");
+}
+
+TEST(RealProjectLanguages, RubySinatraResolvesReceiverType) {
+    expect_qualified_callee("ruby", "sinatra", "call", "ExtendedRack.setup_close");
+}
+
+TEST(RealProjectLanguages, ZigZlsResolvesReceiverType) {
+    expect_qualified_callee("zig", "zls", "resolveTypeOfNode",
+                            "Analyser.resolveBindingOfNode");
+}
+
+}  // namespace
+}  // namespace lci
diff --git a/tests/language_extraction_test.cpp b/tests/language_extraction_test.cpp
@@ -4,8 +4,17 @@
 #include <gtest/gtest.h>
 #include <tree_sitter/api.h>
 
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <map>
+#include <sstream>
 #include <string>
 #include <string_view>
+#include <vector>
 
 namespace lci::parser {
 namespace {
@@ -987,5 +996,129 @@ fn go() void {
     EXPECT_TRUE(has_call_ref(r, "A.run"));    // const a = A{}
 }
 
+// ---------------------------------------------------------------------------
+// Stage-timing profiler (manual; set LCI_PROFILE_DIR=<path> to run).
+// Walks the tree, and for each known-language source file times tree-sitter
+// parse vs. UnifiedExtractor extraction vs. a representative trigram pass.
+// Reports the per-stage totals, throughput, and the slowest single files so a
+// pathological input (huge minified/generated source) is named, not hidden.
+// ---------------------------------------------------------------------------
+TEST(IndexProfile, StageBreakdown) {
+    const char* dir = std::getenv("LCI_PROFILE_DIR");
+    if (dir == nullptr) GTEST_SKIP() << "set LCI_PROFILE_DIR to run";
+
+    namespace fs = std::filesystem;
+    using clk = std::chrono::steady_clock;
+    auto ms = [](clk::duration d) {
+        return std::chrono::duration<double, std::milli>(d).count();
+    };
+
+    struct FileStat {
+        std::string path;
+        size_t bytes = 0;
+        double parse_ms = 0, extract_ms = 0, trigram_ms = 0;
+        size_t symbols = 0, refs = 0;
+    };
+    std::vector<FileStat> stats;
+    double tot_read = 0, tot_parse = 0, tot_extract = 0, tot_trigram = 0;
+    size_t tot_bytes = 0, tot_syms = 0, tot_refs = 0;
+    std::error_code ec;
+    for (auto it = fs::recursive_directory_iterator(
+             dir, fs::directory_options::skip_permission_denied, ec);
+         it != fs::recursive_directory_iterator(); it.increment(ec)) {
+        if (ec) break;
+        if (!it->is_regular_file(ec) || ec) continue;
+        std::string p = it->path().string();
+        if (p.find("/.git/") != std::string::npos) continue;
+        std::string ext = it->path().extension().string();
+        Language lang{};
+        if (!language_from_extension(ext, lang)) continue;
+
+        auto t0 = clk::now();
+        std::ifstream f(p, std::ios::binary);
+        std::stringstream ss;
+        ss << f.rdbuf();
+        std::string content = ss.str();
+        auto t1 = clk::now();
+        if (content.empty()) continue;
+
+        auto tree = parse(lang, content);
+        auto t2 = clk::now();
+        if (!tree) continue;
+
+        UnifiedExtractor ue;
+        ue.init(content, 1, ext, p);
+        ue.extract(tree.get());
+        auto r = ue.get_results();
+        auto t3 = clk::now();
+        // Representative trigram pass: unique 3-byte windows (sort + unique).
+        std::vector<uint32_t> tris;
+        tris.reserve(content.size());
+        const auto* b = reinterpret_cast<const uint8_t*>(content.data());
+        for (size_t i = 0; i + 2 < content.size(); ++i)
+            tris.push_back((uint32_t(b[i]) << 16) | (uint32_t(b[i + 1]) << 8) |
+                           uint32_t(b[i + 2]));
+        std::sort(tris.begin(), tris.end());
+        tris.erase(std::unique(tris.begin(), tris.end()), tris.end());
+        auto t4 = clk::now();
+        FileStat& fsr = stats.emplace_back();
+        fsr.path = p;
+        fsr.bytes = content.size();
+        fsr.parse_ms = ms(t2 - t1);
+        fsr.extract_ms = ms(t3 - t2);
+        fsr.trigram_ms = ms(t4 - t3);
+        fsr.symbols = r.symbols.size();
+        fsr.refs = r.references.size();
+        tot_read += ms(t1 - t0);
+        tot_parse += fsr.parse_ms;
+        tot_extract += fsr.extract_ms;
+        tot_trigram += fsr.trigram_ms;
+        tot_bytes += content.size();
+        tot_syms += r.symbols.size();
+        tot_refs += r.references.size();
+    }
+
+    // No file was profiled (bad path, aborted walk, or no supported sources):
+    // skip instead of printing NaN/inf from the 0/0 ratios in the report.
+    if (stats.empty()) GTEST_SKIP() << "nothing profiled under " << dir
+                                    << (ec ? ": " + ec.message() : "");
+    std::map<std::string, std::pair<double, int>> by_ext;  // ext -> {ms, files}
+    for (const auto& s : stats) {
+        auto& rollup = by_ext[fs::path(s.path).extension().string()];
+        rollup.first += s.parse_ms + s.extract_ms;
+        ++rollup.second;
+    }
+
+    const double cpu = tot_parse + tot_extract + tot_trigram;
+    fprintf(stderr, "\n=== LCI stage profile: %s ===\n", dir);
+    fprintf(stderr, "files=%zu  bytes=%.1f MB  symbols=%zu  refs=%zu\n",
+            stats.size(), tot_bytes / 1e6, tot_syms, tot_refs);
+    fprintf(stderr, "read     : %8.1f ms\n", tot_read);
+    fprintf(stderr, "parse(TS): %8.1f ms  (%.1f%% of cpu)\n", tot_parse,
+            100 * tot_parse / cpu);
+    fprintf(stderr, "extract  : %8.1f ms  (%.1f%% of cpu)\n", tot_extract,
+            100 * tot_extract / cpu);
+    fprintf(stderr, "trigram  : %8.1f ms  (%.1f%% of cpu)\n", tot_trigram,
+            100 * tot_trigram / cpu);
+    fprintf(stderr, "throughput: %.1f MB/s parse, %.0f files/s\n",
+            tot_bytes / 1e6 / (tot_parse / 1000),
+            stats.size() / (cpu / 1000));
+
+    std::sort(stats.begin(), stats.end(), [](const auto& a, const auto& b) {
+        return (a.parse_ms + a.extract_ms) > (b.parse_ms + b.extract_ms);
+    });
+    fprintf(stderr, "--- 12 slowest files (parse+extract) ---\n");
+    for (size_t i = 0; i < stats.size() && i < 12; ++i) {
+        const auto& s = stats[i];
+        fprintf(stderr, "  %7.1f ms  %6.0f KB  p=%5.1f x=%5.1f  %s\n",
+                s.parse_ms + s.extract_ms, s.bytes / 1024.0, s.parse_ms,
+                s.extract_ms, s.path.c_str());
+    }
+    fprintf(stderr, "--- by extension (parse+extract ms / file count) ---\n");
+    for (const auto& [e, pr] : by_ext)
+        fprintf(stderr, "  %-6s %8.1f ms / %d files\n", e.c_str(), pr.first,
+                pr.second);
+}
+
 }  // namespace
 }  // namespace lci::parser