diff --git a/.gitignore b/.gitignore index 7628b0e..5983f43 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Generated artifacts codeindex.json +symbolindex.json repo_graph.json # Python @@ -24,4 +25,4 @@ env/ # IDE .idea/ .vscode/ -*.swp +*.swp \ No newline at end of file diff --git a/codeindex/analyze.py b/codeindex/analyze.py index 3191f2a..efcced5 100644 --- a/codeindex/analyze.py +++ b/codeindex/analyze.py @@ -6,7 +6,7 @@ from codeindex.analyzers import ( python_analyzer, js_analyzer, css_analyzer, go_analyzer, ruby_analyzer, rust_analyzer, java_analyzer, php_analyzer, - docker_analyzer, ci_analyzer, schema_analyzer, + docker_analyzer, ci_analyzer, schema_analyzer, terraform_analyzer, ) from codeindex.analyzers.cross_lang_analyzer import find_api_boundaries from codeindex.analyzers.monorepo_analyzer import detect_workspaces, assign_packages @@ -51,6 +51,8 @@ def detect_languages(root: Path) -> list: langs.append("ci") if _any_match(root, ["*.sql", "*.prisma"]): langs.append("schema") + if _any_match(root, ["*.tf"]): + langs.append("terraform") return langs @@ -62,7 +64,7 @@ def merge_links(target: dict, source: dict) -> None: _INFRA_TYPES = {"service", "pipeline", "database"} _FRONTEND_LANGS = {"javascript", "typescript", "vue", "css", "scss", "less", "sass"} _BACKEND_LANGS = {"python", "go", "ruby", "rust", "java", "kotlin", "php"} -_INFRA_LANGS = {"docker", "github-actions", "gitlab-ci", "sql", "prisma"} +_INFRA_LANGS = {"docker", "github-actions", "gitlab-ci", "sql", "prisma", "terraform"} _ANALYZERS = { "python": python_analyzer, @@ -76,6 +78,7 @@ def merge_links(target: dict, source: dict) -> None: "docker": docker_analyzer, "ci": ci_analyzer, "schema": schema_analyzer, + "terraform": terraform_analyzer, } diff --git a/codeindex/analyzers/terraform_analyzer.py b/codeindex/analyzers/terraform_analyzer.py new file mode 100644 index 0000000..fb8fe72 --- /dev/null +++ b/codeindex/analyzers/terraform_analyzer.py @@ -0,0 +1,361 @@ +"""Terraform / HCL analyzer. + +Produces file-level nodes from .tf files. +Node types: + "config" — variables, providers, versions, backend files + "service" — files defining resources or data sources + "module" — files inside a modules/ directory or containing only module calls +Links: + local module sources, cross-resource references, provider imports. +""" +import re +from pathlib import Path +from typing import Optional + +from .base import load_gitignore_patterns, is_ignored, is_skip_dir, dir_group + +# ── Regexes ──────────────────────────────────────────────────────────────────── +_RESOURCE_RE = re.compile(r'^resource\s+"([^"]+)"\s+"([^"]+)"', re.MULTILINE) +_DATA_RE = re.compile(r'^data\s+"([^"]+)"\s+"([^"]+)"', re.MULTILINE) +_MODULE_RE = re.compile(r'^module\s+"([^"]+)"', re.MULTILINE) +_PROVIDER_RE = re.compile(r'^provider\s+"([^"]+)"', re.MULTILINE) +_SOURCE_ATTR_RE = re.compile(r'\bsource\s*=\s*"([^"]+)"') + +# required_providers { provider = { ... } } — one level of nesting +_REQ_PROVIDERS_RE = re.compile( + r'required_providers\s*\{((?:[^{}]|\{[^{}]*\})*)\}', + re.DOTALL, +) +# Match provider entries (name = { ... }) but not scalar attrs (name = "...") +_PROV_ENTRY_RE = re.compile(r'^\s+(\w+)\s*=\s*\{', re.MULTILINE) + +# Generic resource reference scan: aws_s3_bucket.my_bucket +# Requires at least one underscore in the type name to filter out var/local/module/data +_RES_REF_RE = re.compile(r'\b([a-z][a-z0-9]*(?:_[a-z0-9]+)+)\.([a-zA-Z0-9_]+)\b') +# Data reference: data.type.name +_DATA_REF_RE = re.compile(r'\bdata\.([a-z][a-z0-9_]*)\.([a-zA-Z0-9_]+)\b') +# Variable definition and reference: variable "name" / var.name +_VAR_DEF_RE = re.compile(r'^variable\s+"([^"]+)"', re.MULTILINE) +_VAR_REF_RE = re.compile(r'\bvar\.([a-zA-Z0-9_]+)\b') +# Locals block parsing and reference +_LOCALS_BLOCK_RE = re.compile( + r'^locals\s*\{((?:[^{}]|\{[^{}]*\})*)\}', + re.MULTILINE | re.DOTALL, +) +_LOCAL_KEY_RE = re.compile(r'^\s+(\w+)\s*=', re.MULTILINE) +_LOCAL_NESTED_RE = re.compile(r'\{[^{}]*\}') # strip nested objects before key scan +_LOCAL_REF_RE = re.compile(r'\blocal\.([a-zA-Z0-9_]+)\b') +# Output definition +_OUTPUT_DEF_RE = re.compile(r'^output\s+"([^"]+)"', re.MULTILINE) +# Module output reference: module.name.attr — 3-part cross-dir lookup +_MOD_OUTPUT_REF_RE = re.compile(r'\bmodule\.([a-zA-Z0-9_]+)\.([a-zA-Z0-9_]+)\b') +# Module block reference (2-part): links to file declaring module "name" in same dir +_MOD_REF_RE = re.compile(r'\bmodule\.([a-zA-Z0-9_]+)\b') + +_CONFIG_STEMS = { + "variables", "vars", "versions", "providers", + "terraform", "backend", "locals", "settings", +} +_MODULE_DIRS = {"modules", "module"} + +# Strip common cloud/vendor prefixes when matching module dir name to a file stem +_VENDOR_PREFIX_RE = re.compile(r'^(?:gcp|aws|azure|google|terraform)-') + + +def collect_files(root: Path, patterns: list) -> list[Path]: + files = [] + for p in root.rglob("*.tf"): + if is_skip_dir(p) or is_ignored(p, root, patterns): + continue + files.append(p) + return sorted(files) + + +def node_type(path: Path, has_resources: bool, has_data: bool) -> str: + stem = path.stem.lower() + parts = [p.lower() for p in path.parts] + if stem in _CONFIG_STEMS: + return "config" + if any(p in _MODULE_DIRS for p in parts[:-1]): + return "module" + if has_resources or has_data: + return "service" + return "module" + + +def _module_sources(source: str) -> list[tuple[str, str]]: + """Return (module_name, source_value) for all module blocks in source.""" + results = [] + for m in _MODULE_RE.finditer(source): + window = source[m.start(): m.start() + 600] + src_m = _SOURCE_ATTR_RE.search(window) + if src_m: + results.append((m.group(1), src_m.group(1))) + return results + + +def _provider_names(source: str) -> set[str]: + providers: set[str] = set() + for m in _PROVIDER_RE.finditer(source): + providers.add(m.group(1)) + for block in _REQ_PROVIDERS_RE.finditer(source): + for entry in _PROV_ENTRY_RE.finditer(block.group(1)): + providers.add(entry.group(1)) + return providers + + +def _add_link( + src: str, + tgt: str, + seen: set[tuple[str, str]], + links_map: dict, +) -> None: + key = (src, tgt) + if key not in seen: + seen.add(key) + links_map[key] = 1 + + +def _group_by_dir(tf_files: list[Path], root: Path) -> dict[str, list[Path]]: + """Group .tf files by their parent directory (= one Terraform module each).""" + dirs: dict[str, list[Path]] = {} + for f in tf_files: + key = str(f.parent.relative_to(root)) + dirs.setdefault(key, []).append(f) + return dirs + + +def analyze(root: Path, group_map: dict): + patterns = load_gitignore_patterns(root) + tf_files = collect_files(root, patterns) + + if not tf_files: + return [], [], {}, {"total_files": 0, "total_loc": 0} + + all_rel = {str(f.relative_to(root)) for f in tf_files} + # Every directory that contains at least one .tf file is a module entry-point + module_dirs = {str(f.parent.relative_to(root)) for f in tf_files} + + # Pre-index: module_dir → sorted list of direct .tf file rels in that dir. + # Used to pick the canonical entry-point file when resolving module sources. + _dir_files_index: dict[str, list[str]] = {} + for rel in sorted(all_rel): + parent = str(Path(rel).parent) + _dir_files_index.setdefault(parent, []).append(rel) + + def _module_entry(target_dir: str) -> Optional[str]: + """Return the canonical .tf file to link to for a module directory.""" + candidates = _dir_files_index.get(target_dir, []) + if not candidates: + return None + main = f"{target_dir}/main.tf" + if main in all_rel: + return main + # Try a file whose stem matches the dir name (stripping vendor prefixes) + dir_stem = _VENDOR_PREFIX_RE.sub("", Path(target_dir).name).replace("-", "_") + for c in candidates: + if Path(c).stem == dir_stem: + return c + # Prefer any non-config file over variables/outputs/providers/etc. + for c in candidates: + if Path(c).stem not in _CONFIG_STEMS: + return c + return candidates[0] + + nodes = [] + links_map = {} + ext_nodes = {} + total_loc = 0 + global_output_map: dict[tuple[str, str], str] = {} + + # ── Phase 1 (all dirs): parse files, build LOCAL maps, populate global_output_map ── + # Resources and data sources are scoped to their directory. Two files in + # different directories may define identically-named resources without + # any relationship between them. + per_dir: dict[str, dict] = {} + + for _module_dir, dir_files in _group_by_dir(tf_files, root).items(): + file_data = {} # rel → metadata + resource_map = {} # (type, name) → rel — scoped to this module dir + data_map = {} # (type, name) → rel — scoped to this module dir + var_map = {} # var_name → rel — scoped to this module dir + local_map = {} # local_name → rel — scoped to this module dir + module_instance_map = {} # module_instance_name → rel — scoped to this module dir + + for f in sorted(dir_files): + rel = str(f.relative_to(root)) + try: + source = f.read_text(errors="replace") + except OSError: + continue + + resources = {(m.group(1), m.group(2)) for m in _RESOURCE_RE.finditer(source)} + data_srcs = {(m.group(1), m.group(2)) for m in _DATA_RE.finditer(source)} + var_defs = {m.group(1) for m in _VAR_DEF_RE.finditer(source)} + mod_instance = {m.group(1) for m in _MODULE_RE.finditer(source)} + local_defs: set[str] = set() + for blk in _LOCALS_BLOCK_RE.finditer(source): + flat = _LOCAL_NESTED_RE.sub('{}', blk.group(1)) + for km in _LOCAL_KEY_RE.finditer(flat): + local_defs.add(km.group(1)) + + for r in resources: + resource_map[r] = rel + for d in data_srcs: + data_map[d] = rel + for v in var_defs: + var_map[v] = rel + for lv in local_defs: + local_map[lv] = rel + for mi in mod_instance: + module_instance_map[mi] = rel + + file_data[rel] = { + "source": source, + "resources": resources, + "data_sources": data_srcs, + "modules": _module_sources(source), + "providers": _provider_names(source), + "loc": source.count("\n") + 1, + } + + dir_rel = str(Path(rel).parent) + for om in _OUTPUT_DEF_RE.finditer(source): + global_output_map[(dir_rel, om.group(1))] = rel + + # Build module-instance → resolved target dir map for this module dir + # (used to resolve module.X.Y references to the output file in X's source dir) + module_src_resolved_map: dict[str, str] = {} + for rel_tmp, data_tmp in file_data.items(): + f_tmp = root / rel_tmp + for mod_name, mod_src in data_tmp["modules"]: + if mod_src.startswith(("./", "../")): + try: + target_dir = str( + (f_tmp.parent / mod_src).resolve().relative_to(root.resolve()) + ) + module_src_resolved_map[mod_name] = target_dir + except ValueError: + pass + + per_dir[_module_dir] = { + "file_data": file_data, + "resource_map": resource_map, + "data_map": data_map, + "var_map": var_map, + "local_map": local_map, + "module_instance_map": module_instance_map, + "module_src_resolved_map": module_src_resolved_map, + } + + # ── Phase 2 (all dirs): build nodes and edges (global_output_map fully populated) ── + for d in per_dir.values(): + file_data = d["file_data"] + resource_map = d["resource_map"] + data_map = d["data_map"] + var_map = d["var_map"] + local_map = d["local_map"] + module_instance_map = d["module_instance_map"] + module_src_resolved_map = d["module_src_resolved_map"] + + for rel, data in file_data.items(): + f = root / rel + loc = data["loc"] + total_loc += loc + source = data["source"] + + ntype = node_type(f, bool(data["resources"]), bool(data["data_sources"])) + seen_links: set[tuple[str, str]] = set() + + # Module source → cross-directory edges (the only legitimate cross-dir link) + for _mod_name, mod_src in data["modules"]: + if mod_src.startswith(("./", "../")): + try: + target_dir = str( + (f.parent / mod_src).resolve().relative_to(root.resolve()) + ) + except ValueError: + continue + if target_dir in module_dirs: + entry = _module_entry(target_dir) + if entry: + _add_link(rel, entry, seen_links, links_map) + else: + # Registry module (e.g. "terraform-aws-modules/vpc/aws") + ext_key = mod_src.split("//")[0].split("?")[0] + ext_nodes.setdefault(ext_key, { + "id": ext_key, + "type": "import", + "language": "terraform", + "size": 40, + "loc": 0, + "group": 9000, + "imports": 0, + }) + _add_link(rel, ext_key, seen_links, links_map) + + # Intra-module resource references — LOCAL map only, no cross-dir leakage + for m in _RES_REF_RE.finditer(source): + target_rel = resource_map.get((m.group(1), m.group(2))) + if target_rel and target_rel != rel: + _add_link(rel, target_rel, seen_links, links_map) + + # Intra-module data source references — same scoping rule + for m in _DATA_REF_RE.finditer(source): + target_rel = data_map.get((m.group(1), m.group(2))) + if target_rel and target_rel != rel: + _add_link(rel, target_rel, seen_links, links_map) + + # var.name references → link to file defining that variable + for m in _VAR_REF_RE.finditer(source): + target_rel = var_map.get(m.group(1)) + if target_rel and target_rel != rel: + _add_link(rel, target_rel, seen_links, links_map) + + # local.name references → link to file defining that local + for m in _LOCAL_REF_RE.finditer(source): + target_rel = local_map.get(m.group(1)) + if target_rel and target_rel != rel: + _add_link(rel, target_rel, seen_links, links_map) + + # module.X.attr — cross-dir link to the file defining output "attr" in X's source + for m in _MOD_OUTPUT_REF_RE.finditer(source): + target_dir = module_src_resolved_map.get(m.group(1)) + if target_dir: + output_file = global_output_map.get((target_dir, m.group(2))) + if output_file and output_file != rel: + _add_link(rel, output_file, seen_links, links_map) + + # module.X (2-part) — intra-dir link to the file declaring module "X" + for m in _MOD_REF_RE.finditer(source): + target_rel = module_instance_map.get(m.group(1)) + if target_rel and target_rel != rel: + _add_link(rel, target_rel, seen_links, links_map) + + # Provider external nodes + for prov in data["providers"]: + ext_nodes.setdefault(prov, { + "id": prov, + "type": "import", + "language": "terraform", + "size": 40, + "loc": 0, + "group": 9000, + "imports": 0, + }) + _add_link(rel, prov, seen_links, links_map) + + nodes.append({ + "id": rel, + "type": ntype, + "language": "terraform", + "size": loc, + "loc": loc, + "group": dir_group(f, root, group_map), + "imports": len(seen_links), + }) + + return nodes, list(ext_nodes.values()), links_map, { + "total_files": len(tf_files), + "total_loc": total_loc, + } diff --git a/codeindex/symbol_extractor.py b/codeindex/symbol_extractor.py index a827448..0b36b63 100644 --- a/codeindex/symbol_extractor.py +++ b/codeindex/symbol_extractor.py @@ -317,6 +317,47 @@ def extract_ruby(path: Path) -> list[dict]: return symbols +# ── Terraform / HCL ────────────────────────────────────────────────────────── + +_TF_RESOURCE_RE = re.compile(r'^resource\s+"([^"]+)"\s+"([^"]+)"', re.MULTILINE) +_TF_DATA_RE = re.compile(r'^data\s+"([^"]+)"\s+"([^"]+)"', re.MULTILINE) +_TF_MODULE_RE = re.compile(r'^module\s+"([^"]+)"', re.MULTILINE) +_TF_VAR_RE = re.compile(r'^variable\s+"([^"]+)"', re.MULTILINE) +_TF_OUTPUT_RE = re.compile(r'^output\s+"([^"]+)"', re.MULTILINE) +_TF_PROVIDER_RE = re.compile(r'^provider\s+"([^"]+)"', re.MULTILINE) + + +def extract_terraform(path: Path) -> list[dict]: + try: + source = path.read_text(errors="replace") + except OSError: + return [] + + symbols: list[dict] = [] + seen: set[str] = set() + + for pat, kind, fmt, exported in ( + (_TF_RESOURCE_RE, "resource", "{0}.{1}", True), + (_TF_DATA_RE, "data", "data.{0}.{1}", True), + (_TF_MODULE_RE, "module", "module.{0}", True), + (_TF_VAR_RE, "variable", "var.{0}", True), + (_TF_OUTPUT_RE, "output", "{0}", True), + (_TF_PROVIDER_RE, "provider", "provider.{0}", False), + ): + for m in pat.finditer(source): + name = fmt.format(*m.groups()) + if name and name not in seen: + seen.add(name) + symbols.append({ + "name": name, + "line": _line_of(source, m.start()), + "kind": kind, + "exported": exported, + }) + + return symbols + + # ── Dispatch ───────────────────────────────────────────────────────────────── EXTRACTORS: dict[str, callable] = { @@ -335,6 +376,7 @@ def extract_ruby(path: Path) -> list[dict]: ".rs": extract_rust, ".php": extract_php, ".rb": extract_ruby, + ".tf": extract_terraform, } diff --git a/codeindex/symbols.py b/codeindex/symbols.py index 81fba0e..dd277a8 100644 --- a/codeindex/symbols.py +++ b/codeindex/symbols.py @@ -25,6 +25,11 @@ "type": "ty", "const": "co", "module": "mod", + "resource": "res", + "data": "dat", + "variable": "var", + "output": "out", + "provider": "prv", } diff --git a/viz/explorer.html b/viz/explorer.html index 94fe565..6dc87f5 100644 --- a/viz/explorer.html +++ b/viz/explorer.html @@ -337,6 +337,7 @@ z-index: 9999; display: none; max-width: 220px; + overflow-wrap: break-word; box-shadow: 0 0 24px rgba(0,212,255,0.2); } @@ -1174,8 +1175,15 @@

Keyboard Shortcuts

} function hideTooltip() { tooltip.style.display = 'none'; } document.addEventListener('mousemove', e => { - tooltip.style.left = (e.clientX + 14) + 'px'; - tooltip.style.top = (e.clientY - 10) + 'px'; + const tw = tooltip.offsetWidth; + const th = tooltip.offsetHeight; + const margin = 8; + let left = e.clientX + 14; + let top = e.clientY - 10; + if (left + tw + margin > window.innerWidth) left = e.clientX - tw - 14; + if (top + th + margin > window.innerHeight) top = e.clientY - th - 10; + tooltip.style.left = left + 'px'; + tooltip.style.top = top + 'px'; }); function showNodeDetail(node) {