Files
archlens/crates/adapters/tree-sitter/src/python/mod.rs

368 lines
10 KiB
Rust

use tree_sitter::Node;
use archlens_domain::{CodeElement, CodeElementKind, Relationship, RelationshipKind};
use crate::extraction_context::ExtractionContext;
use crate::language_extractor::LanguageExtractor;
/// Python implementation of [`LanguageExtractor`], backed by the `tree_sitter_python` grammar.
pub struct PythonExtractor;

impl LanguageExtractor for PythonExtractor {
    /// Returns the tree-sitter language built from `tree_sitter_python::LANGUAGE`.
    fn tree_sitter_language(&self) -> tree_sitter::Language {
        tree_sitter::Language::from(tree_sitter_python::LANGUAGE)
    }

    /// Delegates to `collect_classes`, which records class elements together with
    /// their inheritance and composition relationships in the same pass.
    fn extract_types(&self, root: &Node, source: &str, ctx: &mut ExtractionContext) {
        collect_classes(root, source, ctx);
    }

    /// Intentionally does nothing: for Python the relationships are already
    /// recorded by `collect_classes` while types are being extracted.
    fn extract_relationships(&self, _root: &Node, _source: &str, _ctx: &mut ExtractionContext) {}

    /// Delegates to `collect_imports` to record import relationships for the file.
    fn extract_imports(&self, root: &Node, source: &str, ctx: &mut ExtractionContext) {
        collect_imports(root, source, ctx);
    }
}
/// Records every class declared as a direct child of `node`.
///
/// For each class this adds a `CodeElementKind::Class` element (with its public
/// method signatures) to `ctx`, then records inheritance relationships from the
/// superclass list and composition relationships from annotated fields and
/// `__init__` parameters.
///
/// Both plain `class_definition` nodes and classes wrapped in a
/// `decorated_definition` (for example `@dataclass class Foo:`) are handled.
/// Only direct children of `node` are inspected, so nested classes are not
/// registered as elements of their own.
///
/// If `CodeElement::new` rejects the class, a warning is added to `ctx` and the
/// class is skipped, including its relationships.
fn collect_classes(node: &Node, source: &str, ctx: &mut ExtractionContext) {
    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        // A decorated class is wrapped in a `decorated_definition` whose
        // `definition` field holds the actual `class_definition`.
        let class_node = match child.kind() {
            "class_definition" => child,
            "decorated_definition" => match child.child_by_field_name("definition") {
                Some(def) if def.kind() == "class_definition" => def,
                _ => continue,
            },
            _ => continue,
        };

        let Some(name_node) = class_node.child_by_field_name("name") else {
            continue;
        };
        let name = &source[name_node.byte_range()];
        // 1-based line of the `class` keyword itself (not of a leading decorator).
        let line = class_node.start_position().row + 1;

        // Look the body up once and reuse it for methods, fields and constructor params.
        let body = class_node.child_by_field_name("body");
        let methods = body
            .as_ref()
            .map(|b| collect_methods(b, source))
            .unwrap_or_default();

        match CodeElement::new(name, CodeElementKind::Class, ctx.file_path().clone(), line) {
            Ok(element) => ctx.add_element(element.with_methods(methods)),
            Err(e) => {
                ctx.add_warning(ctx.file_path().clone(), line, &e.to_string());
                continue;
            }
        }

        if let Some(superclasses) = class_node.child_by_field_name("superclasses") {
            collect_inheritance(&superclasses, source, name, ctx);
        }
        if let Some(body) = body {
            collect_typed_fields(&body, source, name, ctx);
            collect_constructor_params(&body, source, name, ctx);
        }
    }
}
/// Records an `Inheritance` relationship from `class_name` to each base class
/// listed in `superclasses`.
///
/// Only children of kind `identifier` are considered; other child kinds are
/// skipped. Names accepted by `is_python_builtin` are ignored, and a base for
/// which `Relationship::new` fails is silently dropped.
fn collect_inheritance(
    superclasses: &Node,
    source: &str,
    class_name: &str,
    ctx: &mut ExtractionContext,
) {
    let mut walker = superclasses.walk();
    for base in superclasses.children(&mut walker) {
        if base.kind() != "identifier" {
            continue;
        }

        let base_name = &source[base.byte_range()];
        if is_python_builtin(base_name) {
            continue;
        }

        if let Ok(rel) = Relationship::new(class_name, base_name, RelationshipKind::Inheritance) {
            ctx.add_relationship(rel);
        }
    }
}
/// Type and base-class names that are never reported as relationship targets.
const PYTHON_BUILTINS: &[&str] = &[
    // Built-in types and exception roots.
    "str",
    "int",
    "float",
    "bool",
    "bytes",
    "list",
    "dict",
    "set",
    "tuple",
    "None",
    "type",
    "object",
    "Exception",
    "BaseException",
    // Names that normally come from Python's `typing` module.
    "Optional",
    "Any",
    "Union",
    "List",
    "Dict",
    "Set",
    "Tuple",
    "Callable",
    "Sequence",
    "Mapping",
    "Iterable",
    "Iterator",
    "Generator",
    "Coroutine",
    "AsyncGenerator",
    "ClassVar",
    "Final",
    "Literal",
    "TypeVar",
    "Generic",
    "Protocol",
    "runtime_checkable",
    "Self",
];

/// Returns `true` when `name` exactly (case-sensitively) matches an entry of
/// [`PYTHON_BUILTINS`].
fn is_python_builtin(name: &str) -> bool {
    PYTHON_BUILTINS.iter().any(|&builtin| builtin == name)
}
/// Top-level Python standard-library packages whose imports are not reported
/// as relationships.
///
/// This is a curated list of commonly imported modules, not the complete
/// standard library.
const STDLIB_MODULES: &[&str] = &[
    "os",
    "sys",
    "typing",
    "logging",
    "json",
    "re",
    "io",
    "abc",
    "collections",
    "datetime",
    "enum",
    "functools",
    "hashlib",
    "http",
    "importlib",
    "inspect",
    "itertools",
    "math",
    "pathlib",
    "pickle",
    "random",
    "shutil",
    "signal",
    "socket",
    "string",
    "subprocess",
    "tempfile",
    "threading",
    "time",
    "traceback",
    "unittest",
    "urllib",
    "uuid",
    "warnings",
    "contextlib",
    "dataclasses",
    "copy",
    "struct",
    "base64",
    "csv",
    "glob",
    "operator",
    "textwrap",
    "asyncio",
    "concurrent",
    "multiprocessing",
    // Further commonly imported standard-library modules (alphabetical).
    "argparse",
    "array",
    "ast",
    "atexit",
    "bisect",
    "builtins",
    "bz2",
    "calendar",
    "codecs",
    "configparser",
    "ctypes",
    "decimal",
    "difflib",
    "email",
    "errno",
    "fnmatch",
    "fractions",
    "gc",
    "getpass",
    "gzip",
    "heapq",
    "hmac",
    "html",
    "ipaddress",
    "locale",
    "lzma",
    "mimetypes",
    "numbers",
    "platform",
    "pprint",
    "queue",
    "secrets",
    "select",
    "selectors",
    "shlex",
    "sqlite3",
    "ssl",
    "stat",
    "statistics",
    "tarfile",
    "timeit",
    "types",
    "unicodedata",
    "weakref",
    "xml",
    "zipfile",
    "zlib",
    "zoneinfo",
];

/// Returns `true` when an import of `module` should not be recorded as a
/// relationship.
///
/// Only the first dotted segment is examined. The import is treated as external
/// when that segment is listed in [`STDLIB_MODULES`] or starts with an
/// underscore (for example `__future__` or `_thread`).
///
/// A relative module path such as `.sibling` has an empty first segment and is
/// therefore reported as not external.
fn is_external_import(module: &str) -> bool {
    // `split` always yields at least one item; the fallback is purely defensive.
    let top = module.split('.').next().unwrap_or(module);
    STDLIB_MODULES.contains(&top) || top.starts_with('_')
}
/// Records an `Import` relationship from the current file to every non-external
/// module imported by a direct child statement of `node`.
///
/// The relationship source is the file stem of `ctx.file_path()` (or
/// `"unknown"` when no stem can be derived). Handled statement forms:
///
/// * `import a.b` and `import a.b, c` — one relationship per module;
/// * `import a.b as x` — the aliased form is unwrapped and recorded under the
///   real module path `a.b`, not the alias;
/// * `from a.b import x` — recorded under the `module_name` field (`a.b`).
///
/// Modules for which `is_external_import` returns `true` are skipped, and a
/// module for which `Relationship::new` fails is silently dropped. Import
/// statements nested deeper in the tree (inside functions, `try` blocks, ...)
/// are not visited because only direct children are inspected.
fn collect_imports(node: &Node, source: &str, ctx: &mut ExtractionContext) {
    /// Adds a single file -> module import relationship unless the module is external.
    fn record_import(file_name: &str, module: &str, ctx: &mut ExtractionContext) {
        if is_external_import(module) {
            return;
        }
        if let Ok(rel) = Relationship::new(file_name, module, RelationshipKind::Import) {
            ctx.add_relationship(rel);
        }
    }

    let file_name = std::path::Path::new(ctx.file_path().as_str())
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("unknown")
        .to_string();

    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        match child.kind() {
            "import_statement" => {
                let mut name_cursor = child.walk();
                for name_child in child.children(&mut name_cursor) {
                    // `import a.b as x` is an `aliased_import` whose `name`
                    // field holds the dotted module path.
                    let module_node = match name_child.kind() {
                        "dotted_name" => Some(name_child),
                        "aliased_import" => name_child.child_by_field_name("name"),
                        _ => None,
                    };
                    if let Some(module_node) = module_node {
                        record_import(&file_name, &source[module_node.byte_range()], ctx);
                    }
                }
            }
            "import_from_statement" => {
                if let Some(module_node) = child.child_by_field_name("module_name") {
                    record_import(&file_name, &source[module_node.byte_range()], ctx);
                }
            }
            _ => {}
        }
    }
}
/// Builds signature strings for the public methods declared directly in a class body.
///
/// Each entry has the form `+name(params)` or `+name(params) -> return_type`,
/// where `params` comes from `extract_python_params` and the return type is the
/// trimmed source text of the `return_type` field.
///
/// Methods whose name starts with an underscore (private and dunder methods)
/// are omitted. Decorated methods (`@staticmethod`, `@classmethod`,
/// `@property`, `@abstractmethod`, ...) are wrapped in a `decorated_definition`
/// node; they are unwrapped and listed like undecorated methods.
fn collect_methods(body: &Node, source: &str) -> Vec<String> {
    let mut methods = Vec::new();
    let mut cursor = body.walk();
    for child in body.children(&mut cursor) {
        // Unwrap `decorated_definition` so decorated methods are not lost.
        let func = match child.kind() {
            "function_definition" => child,
            "decorated_definition" => match child.child_by_field_name("definition") {
                Some(def) if def.kind() == "function_definition" => def,
                _ => continue,
            },
            _ => continue,
        };

        let Some(name_node) = func.child_by_field_name("name") else {
            continue;
        };
        let fn_name = &source[name_node.byte_range()];
        if fn_name.starts_with('_') {
            continue;
        }

        let params = func
            .child_by_field_name("parameters")
            .map(|p| extract_python_params(&p, source))
            .unwrap_or_default();
        let ret = func
            .child_by_field_name("return_type")
            .map(|n| source[n.byte_range()].trim().to_string())
            .unwrap_or_default();

        methods.push(if ret.is_empty() {
            format!("+{fn_name}({params})")
        } else {
            format!("+{fn_name}({params}) -> {ret}")
        });
    }
    methods
}
/// Renders a function's parameter list as a comma-separated string.
///
/// Supported parameter node kinds:
///
/// * `identifier` — rendered as `name`;
/// * `typed_parameter` — rendered as `name: type`;
/// * `default_parameter` — rendered as `name` (the default value is omitted);
/// * `typed_default_parameter` — rendered as `name: type` (default omitted).
///
/// Parameters named `self` or `cls` are dropped, as are parameters whose name
/// cannot be determined. Every other node kind (punctuation, `*args`,
/// `**kwargs`, separators) is skipped.
fn extract_python_params(params_node: &Node, source: &str) -> String {
    let mut parts = Vec::new();
    let mut cursor = params_node.walk();
    for param in params_node.children(&mut cursor) {
        let (name, ty): (&str, Option<&str>) = match param.kind() {
            "identifier" => (&source[param.byte_range()], None),
            "typed_parameter" => {
                let Some(type_node) = param.child_by_field_name("type") else {
                    continue;
                };
                // `typed_parameter` exposes no `name` field, so take the first
                // identifier child as the parameter name.
                let mut inner = param.walk();
                let name = param
                    .children(&mut inner)
                    .find(|c| c.kind() == "identifier")
                    .map(|c| &source[c.byte_range()])
                    .unwrap_or_default();
                (name, Some(source[type_node.byte_range()].trim()))
            }
            // Parameters with a default value were previously dropped entirely.
            "default_parameter" | "typed_default_parameter" => {
                let Some(name_node) = param.child_by_field_name("name") else {
                    continue;
                };
                let ty = param
                    .child_by_field_name("type")
                    .map(|t| source[t.byte_range()].trim());
                (&source[name_node.byte_range()], ty)
            }
            _ => continue,
        };

        if name.is_empty() || name == "self" || name == "cls" {
            continue;
        }
        parts.push(match ty {
            Some(ty) => format!("{name}: {ty}"),
            None => name.to_string(),
        });
    }
    parts.join(", ")
}
/// Records `Composition` relationships implied by the type annotations of
/// `__init__` parameters.
///
/// Every `function_definition` named `__init__` that is a direct child of
/// `body` is inspected. For each annotated parameter — with or without a
/// default value (`typed_parameter` / `typed_default_parameter`) — the
/// annotation text before the first `[` is trimmed and used as the target type.
/// Targets that are empty, equal to `class_name`, or accepted by
/// `is_python_builtin` are skipped, and a target for which `Relationship::new`
/// fails is silently dropped.
fn collect_constructor_params(
    body: &Node,
    source: &str,
    class_name: &str,
    ctx: &mut ExtractionContext,
) {
    let mut cursor = body.walk();
    for child in body.children(&mut cursor) {
        if child.kind() != "function_definition" {
            continue;
        }
        let Some(fn_name) = child.child_by_field_name("name") else {
            continue;
        };
        if &source[fn_name.byte_range()] != "__init__" {
            continue;
        }
        let Some(params) = child.child_by_field_name("parameters") else {
            continue;
        };

        let mut param_cursor = params.walk();
        for param in params.children(&mut param_cursor) {
            // `repo: Repo = None` is a `typed_default_parameter`; it carries the
            // same `type` field as a plain `typed_parameter`.
            if !matches!(param.kind(), "typed_parameter" | "typed_default_parameter") {
                continue;
            }
            let Some(type_node) = param.child_by_field_name("type") else {
                continue;
            };

            let type_text = &source[type_node.byte_range()];
            // Strip generic arguments: `Repo[T]` -> `Repo`.
            let base_type = type_text.split('[').next().unwrap_or(type_text).trim();
            if base_type.is_empty() || base_type == class_name || is_python_builtin(base_type) {
                continue;
            }

            if let Ok(rel) = Relationship::new(class_name, base_type, RelationshipKind::Composition)
            {
                ctx.add_relationship(rel);
            }
        }
    }
}
/// Records composition relationships for type-annotated assignments found
/// beneath a class `body`.
///
/// This is a thin entry point: all of the work is done by
/// `collect_typed_fields_recursive`, which is called with the same arguments.
fn collect_typed_fields(body: &Node, source: &str, class_name: &str, ctx: &mut ExtractionContext) {
collect_typed_fields_recursive(body, source, class_name, ctx);
}
/// Walks every descendant of `node` and records a `Composition` relationship
/// from `class_name` for each type-annotated assignment.
///
/// A node qualifies when its kind is `assignment` or `typed_assignment` and it
/// has a `type` field. The annotation text before the first `[` is trimmed and
/// used as the target type; targets that are empty, equal to `class_name`, or
/// accepted by `is_python_builtin` are ignored, and a target for which
/// `Relationship::new` fails is silently dropped.
///
/// The walk descends into every child unconditionally, so annotated assignments
/// nested deeper in the tree (for example inside method bodies) are attributed
/// to `class_name` as well.
fn collect_typed_fields_recursive(
    node: &Node,
    source: &str,
    class_name: &str,
    ctx: &mut ExtractionContext,
) {
    let mut walker = node.walk();
    for descendant in node.children(&mut walker) {
        let annotation = matches!(descendant.kind(), "assignment" | "typed_assignment")
            .then(|| descendant.child_by_field_name("type"))
            .flatten();

        if let Some(annotation) = annotation {
            let annotation_text = &source[annotation.byte_range()];
            let base_type = annotation_text
                .split('[')
                .next()
                .unwrap_or(annotation_text)
                .trim();

            let is_foreign_type = !base_type.is_empty()
                && base_type != class_name
                && !is_python_builtin(base_type);

            // Build the relationship only for qualifying types; construction errors are dropped.
            let relationship = is_foreign_type
                .then(|| Relationship::new(class_name, base_type, RelationshipKind::Composition))
                .and_then(Result::ok);
            if let Some(rel) = relationship {
                ctx.add_relationship(rel);
            }
        }

        collect_typed_fields_recursive(&descendant, source, class_name, ctx);
    }
}