Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[submodule "pylsp"]
path = pylsp
url = git@github.com:Hoblovski/python-lsp-server.git
branch = abc
12 changes: 10 additions & 2 deletions docs/uniast-en.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ Universal Abstract-Syntax-Tree is a LLM-friendly, language-agnostic code context

# Identity Node Unique Identification

To ensure precise querying and scalable storage, `ModPath?PkgPath#SymbolName` is established by convention as the globally unique identifier for AST Nodes.

To ensure precise querying and scalable storage, `ModPath?PkgPath#SymbolName` is used as the globally unique identifier for AST Nodes. For example:

```json
{
Expand All @@ -16,6 +15,15 @@ To ensure precise querying and scalable storage, `ModPath?PkgPath#SymbolName` is
}
```

> Note that different languages have different descriptions of module and package. For example:
> * In Go, a module refers to a project that contains multiple packages, and a package includes all the files within a specific directory.
> * In Python, a package is a directory, which may contain sub-packages. A package can also contain modules, which are .py files inside the package directory.
> * In Rust, the term package does not exist at all. Instead, a crate (project) contains multiple modules, and modules may include sub-modules.
> * In C, neither concept exists at all.
>
> Do not confuse them with the terminology used in abcoder!
> In abcoder, unless otherwise specified, the module (mod) and package (pkg) are defined as follows:

- ModPath: A complete build unit where the content is the installation path@version number. This information is not required for LLMs but is preserved to ensure global uniqueness of Identity. It corresponds to different concepts in various languages:

- <u>Golang</u>: Corresponds to a module, e.g., github.com/cloudwego/[email protected]
Expand Down
12 changes: 10 additions & 2 deletions docs/uniast-zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ Universal Abstract-Syntax-Tree 是 ABCoder 建立的一种 LLM 亲和、语言

# Identity 节点唯一标识

为了保证精确查询和可扩展存储,约定 `ModPath?PkgPath#SymbolName` 为 AST Node 的全球唯一标识。

为了保证精确查询和可扩展存储,约定 `ModPath?PkgPath#SymbolName` 为 AST Node 的全球唯一标识。例如:

```json
{
Expand All @@ -16,6 +15,15 @@ Universal Abstract-Syntax-Tree 是 ABCoder 建立的一种 LLM 亲和、语言
}
```

> 注意,不同的语言对 module 和 package 的描述不同,例如
> * 在 Go 中 module 表示一个项目,包含了若干 package。而 package 包含了某目录下的诸文件。
> * 在 Python 中则是,package 是一个目录,可能包含子 package。而且 package 也可能包含 module,是 package 目录下的 py 文件。
> * 在 Rust 中根本没有 package 的说法,而是 crate(项目)包含了诸 module。module 可能包含子 module。
> * 在 C 中就完全没有这两个东西。
>
> 不要把它们和 abcoder 的描述混淆!
> 在 abcoder 中,除非另外说明,module(mod) / package(pkg) 的含义如下。

- ModPath: 一个完整的构建单元,ModPath 内容为安装路径@版本号。该信息对于 LLM 并不需要,只是为了保证 Identity 的全球唯一性而保存。它在各个语言中对应不同概念:

- <u>Golang</u>: 对应 module,如 github.com/cloudwego/[email protected]
Expand Down
47 changes: 40 additions & 7 deletions lang/collect/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/cloudwego/abcoder/lang/cxx"
"github.com/cloudwego/abcoder/lang/log"
. "github.com/cloudwego/abcoder/lang/lsp"
"github.com/cloudwego/abcoder/lang/python"
"github.com/cloudwego/abcoder/lang/rust"
"github.com/cloudwego/abcoder/lang/uniast"
)
Expand Down Expand Up @@ -88,6 +89,8 @@ func switchSpec(l uniast.Language) LanguageSpec {
return &rust.RustSpec{}
case uniast.Cxx:
return &cxx.CxxSpec{}
case uniast.Python:
return &python.PythonSpec{}
default:
panic(fmt.Sprintf("unsupported language %s", l))
}
Expand All @@ -110,7 +113,30 @@ func NewCollector(repo string, cli *LSPClient) *Collector {
return ret
}

// configureLSP pushes collector-specific settings to the language server.
//
// It only acts when the collector's language is Python and NeedStdSymbol is
// false: in that case it sends a workspace/didChangeConfiguration notification
// that sets pylsp's jedi_definition plugin option "follow_builtin_definitions"
// to false. For every other combination it does nothing.
//
// XXX: should be put in language specification
func (c *Collector) configureLSP(ctx context.Context) {
	if c.Language != uniast.Python || c.NeedStdSymbol {
		return
	}

	conf := map[string]interface{}{
		"settings": map[string]interface{}{
			"pylsp": map[string]interface{}{
				"plugins": map[string]interface{}{
					"jedi_definition": map[string]interface{}{
						"follow_builtin_definitions": false,
					},
				},
			},
		},
	}

	// This function has no error return, so a failed notification is
	// discarded explicitly rather than implicitly.
	// NOTE(review): consider logging or propagating this error.
	_ = c.cli.Notify(ctx, "workspace/didChangeConfiguration", conf)
}

func (c *Collector) Collect(ctx context.Context) error {
c.configureLSP(ctx)
excludes := make([]string, len(c.Excludes))
for i, e := range c.Excludes {
if !filepath.IsAbs(e) {
Expand All @@ -121,7 +147,7 @@ func (c *Collector) Collect(ctx context.Context) error {
}

// scan all files
roots := make([]*DocumentSymbol, 0, 1024)
root_syms := make([]*DocumentSymbol, 0, 1024)
scanner := func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
Expand Down Expand Up @@ -169,6 +195,11 @@ func (c *Collector) Collect(ctx context.Context) error {
if err != nil {
return err
}
// HACK: skip imported symbols (do not expose imported symbols in Python)
// TODO: make this behavior consistent in python and rust (where we have pub use vs use)
if c.Language == uniast.Python && (strings.HasPrefix(content, "from ") || strings.HasPrefix(content, "import ")) {
continue
}
// collect tokens
tokens, err := c.cli.SemanticTokens(ctx, sym.Location)
if err != nil {
Expand All @@ -177,7 +208,7 @@ func (c *Collector) Collect(ctx context.Context) error {
sym.Text = content
sym.Tokens = tokens
c.syms[sym.Location] = sym
roots = append(roots, sym)
root_syms = append(root_syms, sym)
}

return nil
Expand All @@ -187,11 +218,11 @@ func (c *Collector) Collect(ctx context.Context) error {
}

// collect some extra metadata
syms := make([]*DocumentSymbol, 0, len(roots))
for _, sym := range roots {
entity_syms := make([]*DocumentSymbol, 0, len(root_syms))
for _, sym := range root_syms {
// only language entity symbols need to be collect on next
if c.spec.IsEntitySymbol(*sym) {
syms = append(syms, sym)
entity_syms = append(entity_syms, sym)
}
c.processSymbol(ctx, sym, 1)
}
Expand Down Expand Up @@ -229,7 +260,7 @@ func (c *Collector) Collect(ctx context.Context) error {
// }

// collect dependencies
for _, sym := range syms {
for _, sym := range entity_syms {
next_token:

for i, token := range sym.Tokens {
Expand Down Expand Up @@ -572,11 +603,13 @@ func (c *Collector) collectImpl(ctx context.Context, sym *DocumentSymbol, depth
}
}
var impl string
// HACK: impl head for Rust.
if fn > 0 && fn < len(sym.Tokens) {
impl = ChunkHead(sym.Text, sym.Location.Range.Start, sym.Tokens[fn].Location.Range.Start)
}
// HACK: implhead for Python. Should actually be provided by the language spec.
if impl == "" || len(impl) < len(sym.Name) {
impl = sym.Name
impl = fmt.Sprintf("class %s {\n", sym.Name)
}
// search all methods
for _, method := range c.syms {
Expand Down
13 changes: 6 additions & 7 deletions lang/lsp/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ import (
// LSPClient embeds the JSON-RPC connection, the LSP handler and the client
// options, and keeps per-server state next to them.
type LSPClient struct {
*jsonrpc2.Conn
*lspHandler
tokenTypes []string
tokenModifiers []string
files map[DocumentURI]*TextDocumentItem
tokenTypes []string
tokenModifiers []string
// hasSemanticTokensRange is true only when the server's
// semanticTokensProvider capability carries a boolean "range" set to true.
hasSemanticTokensRange bool
files map[DocumentURI]*TextDocumentItem
ClientOptions
}

Expand Down Expand Up @@ -156,10 +157,6 @@ func initLSPClient(ctx context.Context, svr io.ReadWriteCloser, dir DocumentURI,
return nil, fmt.Errorf("server did not provide TypeDefinition")
}

implementationProvider, ok := vs["implementationProvider"].(bool)
if !ok || !implementationProvider {
return nil, fmt.Errorf("server did not provide Implementation")
}
documentSymbolProvider, ok := vs["documentSymbolProvider"].(bool)
if !ok || !documentSymbolProvider {
return nil, fmt.Errorf("server did not provide DocumentSymbol")
Expand All @@ -174,6 +171,8 @@ func initLSPClient(ctx context.Context, svr io.ReadWriteCloser, dir DocumentURI,
if !ok || semanticTokensProvider == nil {
return nil, fmt.Errorf("server did not provide SemanticTokensProvider")
}
semanticTokensRange, ok := semanticTokensProvider["range"].(bool)
cli.hasSemanticTokensRange = ok && semanticTokensRange
legend, ok := semanticTokensProvider["legend"].(map[string]interface{})
if !ok || legend == nil {
return nil, fmt.Errorf("server did not provide SemanticTokensProvider.legend")
Expand Down
28 changes: 14 additions & 14 deletions lang/lsp/lsp.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import (
"sort"
"strings"

"github.com/cloudwego/abcoder/lang/uniast"
"github.com/cloudwego/abcoder/lang/utils"
"github.com/sourcegraph/go-lsp"
)
Expand Down Expand Up @@ -285,22 +284,23 @@ func (cli *LSPClient) References(ctx context.Context, id Location) ([]Location,
return resp, nil
}

// TODO(perf): cache results especially for whole file queries.
// TODO(refactor): infer use_full_method from capabilities
func (cli *LSPClient) getSemanticTokensRange(ctx context.Context, req DocumentRange, resp *SemanticTokens, use_full_method bool) error {
if use_full_method {
req1 := struct {
TextDocument lsp.TextDocumentIdentifier `json:"textDocument"`
}{TextDocument: req.TextDocument}
if err := cli.Call(ctx, "textDocument/semanticTokens/full", req1, resp); err != nil {
return err
}
filterSemanticTokensInRange(resp, req.Range)
} else {
// getSemanticTokensRange fills resp with the semantic tokens for req.Range.
//
// If cli.hasSemanticTokensRange is set, textDocument/semanticTokens/range is
// called with req as-is. Otherwise the tokens of the whole document are
// requested through textDocument/semanticTokens/full and then narrowed to
// req.Range by filterSemanticTokensInRange.
//
// Any error from the underlying call is returned unchanged.
func (cli *LSPClient) getSemanticTokensRange(ctx context.Context, req DocumentRange, resp *SemanticTokens) error {
	if !cli.hasSemanticTokensRange {
		// No range support: ask for the full document, then filter manually.
		fullReq := struct {
			TextDocument lsp.TextDocumentIdentifier `json:"textDocument"`
		}{TextDocument: req.TextDocument}

		err := cli.Call(ctx, "textDocument/semanticTokens/full", fullReq, resp)
		if err != nil {
			return err
		}
		filterSemanticTokensInRange(resp, req.Range)
		return nil
	}

	return cli.Call(ctx, "textDocument/semanticTokens/range", req, resp)
}

Expand Down Expand Up @@ -355,7 +355,7 @@ func (cli *LSPClient) SemanticTokens(ctx context.Context, id Location) ([]Token,
}

var resp SemanticTokens
if err := cli.getSemanticTokensRange(ctx, req, &resp, cli.Language == uniast.Cxx); err != nil {
if err := cli.getSemanticTokensRange(ctx, req, &resp); err != nil {
return nil, err
}

Expand Down
5 changes: 5 additions & 0 deletions lang/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/cloudwego/abcoder/lang/golang/parser"
"github.com/cloudwego/abcoder/lang/log"
"github.com/cloudwego/abcoder/lang/lsp"
"github.com/cloudwego/abcoder/lang/python"
"github.com/cloudwego/abcoder/lang/rust"
"github.com/cloudwego/abcoder/lang/uniast"
)
Expand Down Expand Up @@ -106,6 +107,8 @@ func checkRepoPath(repoPath string, language uniast.Language) (openfile string,
openfile, wait = rust.CheckRepo(repoPath)
case uniast.Cxx:
openfile, wait = cxx.CheckRepo(repoPath)
case uniast.Python:
openfile, wait = python.CheckRepo(repoPath)
default:
openfile = ""
wait = 0
Expand All @@ -121,6 +124,8 @@ func checkLSP(language uniast.Language, lspPath string) (l uniast.Language, s st
l, s = rust.GetDefaultLSP()
case uniast.Cxx:
l, s = cxx.GetDefaultLSP()
case uniast.Python:
l, s = python.GetDefaultLSP()
case uniast.Golang:
l = uniast.Golang
s = ""
Expand Down
42 changes: 42 additions & 0 deletions lang/python/lib.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright 2025 CloudWeGo Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package python

import (
"time"

"github.com/cloudwego/abcoder/lang/uniast"
"github.com/cloudwego/abcoder/lang/utils"
)

// MaxWaitDuration is the upper bound on the wait duration returned by CheckRepo.
const MaxWaitDuration = 5 * time.Second

// GetDefaultLSP returns the default language server for Python: the language
// identifier uniast.Python together with the server name "pylsp".
func GetDefaultLSP() (lang uniast.Language, name string) {
	// Use custom PyLSP.
	lang, name = uniast.Python, "pylsp"
	return lang, name
}

// CheckRepo returns the file to open first and how long to wait for a Python
// repository.
//
// The returned file path is always empty. The wait starts at two seconds and
// grows by one second per 1024 units of the size reported by utils.CountFiles
// for ".py" files, and never exceeds MaxWaitDuration.
// NOTE(review): the unit of that size and the meaning of the "SKIPDIR"
// argument are defined by utils.CountFiles — confirm there.
func CheckRepo(repo string) (string, time.Duration) {
	// TODO: check if the project compiles.

	// Scale the wait with the amount of Python code in the repository.
	_, size := utils.CountFiles(repo, ".py", "SKIPDIR")
	extra := time.Duration(size/1024) * time.Second
	wait := 2*time.Second + extra
	if wait > MaxWaitDuration {
		return "", MaxWaitDuration
	}
	return "", wait
}
Loading