Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,17 @@ Testing/
# dartai loop state (machine-local)
.dartai/

# Real projects test data (git submodules)
# Real projects test data (fetched via scripts/add-real-projects.sh)
real_projects/go/
real_projects/python/
real_projects/typescript/
real_projects/java/
real_projects/csharp/
real_projects/rust/
real_projects/php/
real_projects/kotlin/
real_projects/ruby/
real_projects/zig/

# worktrack loop state (per-clone, may contain private email in lease_holder)
.worktrack/
5 changes: 5 additions & 0 deletions include/lci/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ struct ProjectConfig {

struct IndexConfig {
int64_t max_file_size = 10 * 1024 * 1024; // 10 MB
// Files larger than this are still trigram-indexed for text search but skip
// the tree-sitter parse + symbol extraction: a multi-MB source file is
// almost always generated/minified, where the parse cost (parse is ~58% of
// index CPU) buys little symbol value. 0 disables the cap.
int64_t max_parse_file_size = 2 * 1024 * 1024; // 2 MB
int64_t max_total_size_mb = 500;
int max_file_count = 10000;
bool follow_symlinks = false;
Expand Down
1 change: 1 addition & 0 deletions include/lci/indexing/pipeline_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ struct ProcessedFile {
std::chrono::nanoseconds duration{};
Error error{};
bool has_error{};
bool parse_skipped_oversize{}; // trigram-indexed, tree-sitter skipped
};

/// Pipeline buffer size constants.
Expand Down
11 changes: 9 additions & 2 deletions real_projects/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ git submodule update --recursive --remote
Or use the provided setup script:

```bash
./scripts/add-real-projects.sh --minimal # 2 projects (fast)
./scripts/add-real-projects.sh --full # All 9 projects
./scripts/add-real-projects.sh --minimal # 3 projects (fast: Go/Python/TS)
./scripts/add-real-projects.sh --full # All 16 projects (covers 13 languages)
```

## Directory Structure
Expand All @@ -48,6 +48,13 @@ real_projects/
│ ├── next.js/ # Next.js React framework
│ ├── shadcn-ui/ # UI component library
│ └── trpc/ # TypeScript RPC framework
├── java/gson/ # Call-graph corpora for the scope-type-resolution
├── csharp/serilog/ # languages (each had no call graph before that work).
├── rust/ripgrep/ # real_project_languages_test.cpp asserts receiver-
├── php/guzzle/ # type resolution fires on each of these real repos.
├── kotlin/okhttp/
├── ruby/sinatra/
├── zig/zls/
└── README.md # This file
```

Expand Down
12 changes: 11 additions & 1 deletion scripts/add-real-projects.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ PROJECTS=(
"typescript:next.js:https://github.com/vercel/next.js.git"
"typescript:shadcn-ui:https://github.com/shadcn-ui/ui.git"
"typescript:trpc:https://github.com/trpc/trpc.git"
# Call-graph corpora for the scope-type-resolution languages. Each had no
# call graph before that work; real_project_languages_test.cpp asserts
# receiver-type resolution fires on these real repos.
"java:gson:https://github.com/google/gson.git"
"csharp:serilog:https://github.com/serilog/serilog.git"
"rust:ripgrep:https://github.com/BurntSushi/ripgrep.git"
"php:guzzle:https://github.com/guzzle/guzzle.git"
"kotlin:okhttp:https://github.com/square/okhttp.git"
"ruby:sinatra:https://github.com/sinatra/sinatra.git"
"zig:zls:https://github.com/zigtools/zls.git"
)

# Minimal projects for initial setup. trpc keeps the TS surface
Expand Down Expand Up @@ -90,7 +100,7 @@ fi

# Create real_projects directories
log "Creating directory structure..."
mkdir -p real_projects/{go,python,typescript}
mkdir -p real_projects/{go,python,typescript,java,csharp,rust,php,kotlin,ruby,zig}

ADDED=0
FAILED=0
Expand Down
8 changes: 8 additions & 0 deletions src/config/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,14 @@ void apply_index(Config& cfg, const KdlNode& node) {
int v = 0;
if (get_int(child, v)) cfg.index.max_file_size = v;
}
} else if (child.name == "max_parse_file_size") {
std::string sz;
if (get_string(child, sz)) {
cfg.index.max_parse_file_size = parse_size_string(sz);
} else {
int v = 0;
if (get_int(child, v)) cfg.index.max_parse_file_size = v;
}
} else if (child.name == "max_total_size_mb") {
int v = 0;
if (get_int(child, v)) cfg.index.max_total_size_mb = v;
Expand Down
15 changes: 13 additions & 2 deletions src/indexing/pipeline_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ namespace {
/// that trigram/postings indexing still proceeds.
void run_unified_extraction(ProcessedFile& result,
std::string_view content,
const std::string& path) {
const std::string& path,
int64_t max_parse_bytes) {
auto ext = std::filesystem::path(path).extension().string();
if (ext.empty()) return;

Expand All @@ -32,6 +33,15 @@ void run_unified_extraction(ProcessedFile& result,
return; // Unsupported language: trigrams still index for text search.
}

// Oversized source: skip the tree-sitter parse (the expensive stage) but
// keep trigram text indexing. Surface the skip rather than silently
// dropping symbols for the file.
if (max_parse_bytes > 0 &&
static_cast<int64_t>(content.size()) > max_parse_bytes) {
result.parse_skipped_oversize = true;
return;
}

parser::PooledParser parser_guard(lang);
if (!parser_guard) return;

Expand Down Expand Up @@ -182,7 +192,8 @@ ProcessedFile FileProcessor::process_file(int /*worker_id*/,
// tree-sitter. This populates the symbol-aware data the integrator
// feeds into ReferenceTracker. Without this step, browse-file,
// list-symbols, references, and tree endpoints all return empty.
run_unified_extraction(result, content, task.path);
run_unified_extraction(result, content, task.path,
config_.index.max_parse_file_size);

// Bucket trigrams during processing (zero-lock per-file)
if (trigram_index_ != nullptr && content.size() >= 3) {
Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ add_executable(lci_real_project_tests
integration/real_project_side_effects_test.cpp
integration/real_project_typescript_test.cpp
integration/real_project_feature_audit_test.cpp
integration/real_project_languages_test.cpp
integration/spec_runner.cpp
)
target_include_directories(lci_real_project_tests PRIVATE
Expand Down
91 changes: 91 additions & 0 deletions tests/integration/real_project_languages_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Real-project call-graph resolution across all newly-wired languages.
//
// Each of these seven languages had NO call graph before scope-based
// receiver-type resolution was added. These tests index a real upstream repo
// per language and assert that (a) symbols are extracted and (b) a known
// method call resolves to a receiver-type-qualified target (`Type.method`),
// i.e. the SCIP-base-case resolution fires on real code, not just the
// controlled corpora in the unit suite.
//
// Skips gracefully when the corpus is absent (run ./scripts/add-real-projects.sh).

#include <gtest/gtest.h>

#include <lci/config.h>
#include <lci/core/reference_tracker.h>
#include <lci/indexing/master_index.h>

#include <string>
#include <vector>

#include "helpers/real_project_helpers.h"

namespace lci {
namespace {

// Indexes <lang>/<repo>, asserts symbols were extracted, then confirms the
// sentinel method's callee set contains the expected receiver-type-qualified
// edge — proving type resolution works for the language on real source.
void expect_qualified_callee(const std::string& lang, const std::string& repo,
                             const std::string& sentinel,
                             const std::string& qualified_callee) {
  // A missing corpus is reported as a skip, not a failure.
  auto project_dir = testing::find_real_project(lang, repo);
  if (!project_dir) {
    GTEST_SKIP() << "Real project not found: " << lang << "/" << repo
                 << ". Run ./scripts/add-real-projects.sh";
  }

  // Index the repo; everything below needs a valid, non-empty index.
  auto project = testing::setup_real_project(*project_dir, repo);
  ASSERT_TRUE(project.valid()) << "Failed to index " << lang << "/" << repo;
  EXPECT_GT(project.indexer->file_count(), 0);

  const auto& tracker = project.indexer->ref_tracker();
  auto candidates = tracker.find_symbols_by_name(sentinel);
  ASSERT_FALSE(candidates.empty())
      << "sentinel symbol '" << sentinel << "' not extracted in " << repo;

  // True when `sym` has a callee whose name is exactly `qualified_callee`.
  const auto calls_expected_target = [&](const auto* sym) {
    for (const auto& name : tracker.get_callee_names(sym->id)) {
      if (name == qualified_callee) return true;
    }
    return false;
  };

  // Several symbols may share the sentinel name; one matching edge on any of
  // them is enough.
  bool found = false;
  for (const auto* candidate : candidates) {
    if (calls_expected_target(candidate)) {
      found = true;
      break;
    }
  }

  EXPECT_TRUE(found) << lang << "/" << repo << ": " << sentinel
                     << " should have a receiver-type-qualified callee '"
                     << qualified_callee << "'";
}

// Java (java/gson): a symbol named `toJson` must list `Gson.toJson` as a callee.
TEST(RealProjectLanguages, JavaGsonResolvesReceiverType) {
expect_qualified_callee("java", "gson", "toJson", "Gson.toJson");
}

// C# (csharp/serilog): a symbol named `Write` must list `Logger.IsEnabled` as a
// callee.
TEST(RealProjectLanguages, CSharpSerilogResolvesReceiverType) {
expect_qualified_callee("csharp", "serilog", "Write", "Logger.IsEnabled");
}

// Rust (rust/ripgrep): a symbol named `build` must list `GlobSetBuilder.add` as
// a callee.
TEST(RealProjectLanguages, RustRipgrepResolvesReceiverType) {
expect_qualified_callee("rust", "ripgrep", "build", "GlobSetBuilder.add");
}

// PHP (php/guzzle): a symbol named `send` must list `Client.sendAsync` as a
// callee.
TEST(RealProjectLanguages, PhpGuzzleResolvesReceiverType) {
expect_qualified_callee("php", "guzzle", "send", "Client.sendAsync");
}

// Kotlin (kotlin/okhttp): a symbol named `intercept` must list `Chain.request`
// as a callee.
TEST(RealProjectLanguages, KotlinOkhttpResolvesReceiverType) {
expect_qualified_callee("kotlin", "okhttp", "intercept", "Chain.request");
}

// Ruby (ruby/sinatra): a symbol named `call` must list
// `ExtendedRack.setup_close` as a callee.
TEST(RealProjectLanguages, RubySinatraResolvesReceiverType) {
expect_qualified_callee("ruby", "sinatra", "call", "ExtendedRack.setup_close");
}

// Zig (zig/zls): a symbol named `resolveTypeOfNode` must list
// `Analyser.resolveBindingOfNode` as a callee.
TEST(RealProjectLanguages, ZigZlsResolvesReceiverType) {
expect_qualified_callee("zig", "zls", "resolveTypeOfNode",
"Analyser.resolveBindingOfNode");
}

} // namespace
} // namespace lci
133 changes: 133 additions & 0 deletions tests/language_extraction_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,17 @@
#include <gtest/gtest.h>
#include <tree_sitter/api.h>

#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <map>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>

namespace lci::parser {
namespace {
Expand Down Expand Up @@ -987,5 +996,129 @@ fn go() void {
EXPECT_TRUE(has_call_ref(r, "A.run")); // const a = A{}
}

// ---------------------------------------------------------------------------
// Stage-timing profiler (manual; set LCI_PROFILE_DIR=<path> to run).
// Walks the tree, and for each known-language source file times tree-sitter
// parse vs. UnifiedExtractor extraction vs. a representative trigram pass.
// Reports the per-stage totals, throughput, and the slowest single files so a
// pathological input (huge minified/generated source) is named, not hidden.
// ---------------------------------------------------------------------------
TEST(IndexProfile, StageBreakdown) {
  // Opt-in manual profiler: without LCI_PROFILE_DIR the test is skipped.
  const char* dir = std::getenv("LCI_PROFILE_DIR");
  if (dir == nullptr) GTEST_SKIP() << "set LCI_PROFILE_DIR to run";

  namespace fs = std::filesystem;
  using clk = std::chrono::steady_clock;
  auto ms = [](clk::duration d) {
    return std::chrono::duration<double, std::milli>(d).count();
  };
  // Division that yields 0 instead of NaN/inf when the denominator is zero,
  // which happens when the directory is missing or holds no parseable files.
  auto ratio = [](double num, double den) {
    return den > 0.0 ? num / den : 0.0;
  };

  // Per-file measurements; kept so the slowest inputs can be named afterwards.
  struct FileStat {
    std::string path;
    size_t bytes = 0;
    double parse_ms = 0, extract_ms = 0, trigram_ms = 0;
    size_t symbols = 0, refs = 0;
  };
  std::vector<FileStat> stats;
  double tot_read = 0, tot_parse = 0, tot_extract = 0, tot_trigram = 0;
  size_t tot_bytes = 0, tot_syms = 0, tot_refs = 0;

  // Error-code overloads throughout: an unreadable entry ends or skips the
  // walk instead of throwing out of the test.
  std::error_code ec;
  for (auto it = fs::recursive_directory_iterator(
           dir, fs::directory_options::skip_permission_denied, ec);
       it != fs::recursive_directory_iterator(); it.increment(ec)) {
    if (ec) break;
    if (!it->is_regular_file(ec) || ec) continue;
    std::string p = it->path().string();
    if (p.find("/.git/") != std::string::npos) continue;
    std::string ext = it->path().extension().string();
    Language lang{};
    if (!language_from_extension(ext, lang)) continue;

    // Stage 0: read the whole file into memory.
    auto t0 = clk::now();
    std::ifstream f(p, std::ios::binary);
    std::stringstream ss;
    ss << f.rdbuf();
    std::string content = ss.str();
    auto t1 = clk::now();
    if (content.empty()) continue;

    // Stage 1: tree-sitter parse.
    auto tree = parse(lang, content);
    auto t2 = clk::now();
    if (!tree) continue;

    // Stage 2: symbol/reference extraction over the parsed tree.
    UnifiedExtractor ue;
    ue.init(content, 1, ext, p);
    ue.extract(tree.get());
    auto r = ue.get_results();
    auto t3 = clk::now();

    // Stage 3: representative trigram pass. Pack every 3-byte sliding window
    // into a uint32, then sort + unique to get the distinct set.
    std::vector<uint32_t> tris;
    tris.reserve(content.size());
    const auto* b = reinterpret_cast<const uint8_t*>(content.data());
    for (size_t i = 0; i + 2 < content.size(); ++i)
      tris.push_back((uint32_t(b[i]) << 16) | (uint32_t(b[i + 1]) << 8) |
                     uint32_t(b[i + 2]));
    std::sort(tris.begin(), tris.end());
    tris.erase(std::unique(tris.begin(), tris.end()), tris.end());
    auto t4 = clk::now();

    FileStat fsr;
    fsr.path = p;
    fsr.bytes = content.size();
    fsr.parse_ms = ms(t2 - t1);
    fsr.extract_ms = ms(t3 - t2);
    fsr.trigram_ms = ms(t4 - t3);
    fsr.symbols = r.symbols.size();
    fsr.refs = r.references.size();
    tot_read += ms(t1 - t0);
    tot_parse += fsr.parse_ms;
    tot_extract += fsr.extract_ms;
    tot_trigram += fsr.trigram_ms;
    tot_bytes += content.size();
    tot_syms += r.symbols.size();
    tot_refs += r.references.size();
    stats.push_back(std::move(fsr));
  }

  // ext -> {parse+extract ms, file count}
  std::map<std::string, std::pair<double, int>> by_ext;
  for (const auto& s : stats) {
    auto e = fs::path(s.path).extension().string();
    by_ext[e].first += s.parse_ms + s.extract_ms;
    by_ext[e].second++;
  }

  // Read time is reported separately and excluded from the "cpu" total.
  double cpu = tot_parse + tot_extract + tot_trigram;
  fprintf(stderr, "\n=== LCI stage profile: %s ===\n", dir);
  if (stats.empty())
    fprintf(stderr, "(no parseable source files found under %s)\n", dir);
  fprintf(stderr, "files=%zu bytes=%.1f MB symbols=%zu refs=%zu\n",
          stats.size(), tot_bytes / 1e6, tot_syms, tot_refs);
  fprintf(stderr, "read     : %8.1f ms\n", tot_read);
  fprintf(stderr, "parse(TS): %8.1f ms (%.1f%% of cpu)\n", tot_parse,
          100 * ratio(tot_parse, cpu));
  fprintf(stderr, "extract  : %8.1f ms (%.1f%% of cpu)\n", tot_extract,
          100 * ratio(tot_extract, cpu));
  fprintf(stderr, "trigram  : %8.1f ms (%.1f%% of cpu)\n", tot_trigram,
          100 * ratio(tot_trigram, cpu));
  fprintf(stderr, "throughput: %.1f MB/s parse, %.0f files/s\n",
          ratio(tot_bytes / 1e6, tot_parse / 1000),
          ratio(static_cast<double>(stats.size()), cpu / 1000));

  // Slowest first, so a pathological input is named rather than averaged away.
  std::sort(stats.begin(), stats.end(), [](const auto& a, const auto& b) {
    return (a.parse_ms + a.extract_ms) > (b.parse_ms + b.extract_ms);
  });
  fprintf(stderr, "--- 12 slowest files (parse+extract) ---\n");
  for (size_t i = 0; i < stats.size() && i < 12; ++i) {
    const auto& s = stats[i];
    fprintf(stderr, "  %7.1f ms  %6.0f KB  p=%5.1f x=%5.1f  %s\n",
            s.parse_ms + s.extract_ms, s.bytes / 1024.0, s.parse_ms,
            s.extract_ms, s.path.c_str());
  }
  fprintf(stderr, "--- by extension (parse+extract ms / file count) ---\n");
  for (const auto& [e, pr] : by_ext)
    fprintf(stderr, "  %-6s %8.1f ms / %d files\n", e.c_str(), pr.first,
            pr.second);
}

} // namespace
} // namespace lci::parser
Loading