Spaces:
Running
Running
| """Document entity model for representing source files.""" | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Optional | |
@dataclass
class DocumentMetadata:
    """Metadata for a source document.

    A plain value object: constructing it performs no I/O. Only
    ``file_path`` is required; every other field has a default.
    """

    # Path of the document, stored as a string.
    file_path: str
    # Language name, or None when no language is known.
    language: Optional[str] = None
    # Size of the document in bytes.
    size_bytes: int = 0
    # Number of lines in the document.
    line_count: int = 0
    # Name of the text encoding.
    encoding: str = "utf-8"

    # NOTE(review): kept as a plain method to match the visible definition;
    # confirm whether callers expect a property here.
    def extension(self) -> str:
        """Return the final suffix of ``file_path`` without its leading dot.

        Returns:
            The extension (e.g. ``"py"`` for ``"src/app.py"``), or an empty
            string when the path has no suffix.
        """
        return Path(self.file_path).suffix.lstrip(".")
@dataclass
class Document:
    """Represents a source code file loaded for processing."""

    # Full decoded text of the file.
    content: str
    # Descriptive metadata (path, language, size, line count, encoding).
    metadata: DocumentMetadata
    # Identifier of the repository the file belongs to; empty when not given.
    repo_id: str = ""

    # NOTE(review): the two accessors below are kept as plain methods to match
    # the visible definitions; confirm whether callers expect properties.
    def file_path(self) -> str:
        """Convenience accessor for ``metadata.file_path``."""
        return self.metadata.file_path

    def language(self) -> Optional[str]:
        """Convenience accessor for ``metadata.language``."""
        return self.metadata.language

    @classmethod
    def from_file(cls, file_path: Path, repo_root: Path, repo_id: str = "") -> "Document":
        """Create a Document by reading a file from disk.

        Args:
            file_path: Path of the file to load.
            repo_root: Directory that ``file_path`` is made relative to; the
                relative path is what gets stored in the metadata.
            repo_id: Optional repository identifier stored on the document.

        Returns:
            A new instance holding the file's text and derived metadata.

        Raises:
            OSError: If the file cannot be read or stat'ed.
            UnicodeDecodeError: If the file is not valid UTF-8.
            ValueError: If ``file_path`` is not located under ``repo_root``.
        """
        content = file_path.read_text(encoding="utf-8")
        relative_path = str(file_path.relative_to(repo_root))

        # Empty content has zero lines; otherwise every "\n" starts a new
        # line, so text ending in a newline counts one extra (empty) line.
        line_count = (content.count("\n") + 1) if content else 0

        metadata = DocumentMetadata(
            file_path=relative_path,
            language=_detect_language(file_path.suffix),
            # On-disk size in bytes, which can differ from len(content).
            size_bytes=file_path.stat().st_size,
            line_count=line_count,
        )
        return cls(content=content, metadata=metadata, repo_id=repo_id)
| def _detect_language(extension: str) -> Optional[str]: | |
| """Detect programming language from file extension.""" | |
| extension_map = { | |
| ".py": "python", | |
| ".js": "javascript", | |
| ".ts": "typescript", | |
| ".jsx": "javascript", | |
| ".tsx": "typescript", | |
| ".java": "java", | |
| ".go": "go", | |
| ".rs": "rust", | |
| ".rb": "ruby", | |
| ".php": "php", | |
| ".c": "c", | |
| ".cpp": "cpp", | |
| ".h": "c", | |
| ".hpp": "cpp", | |
| ".cs": "csharp", | |
| ".swift": "swift", | |
| ".kt": "kotlin", | |
| ".scala": "scala", | |
| ".md": "markdown", | |
| ".rst": "restructuredtext", | |
| ".yaml": "yaml", | |
| ".yml": "yaml", | |
| ".json": "json", | |
| ".toml": "toml", | |
| ".xml": "xml", | |
| ".html": "html", | |
| ".css": "css", | |
| ".sql": "sql", | |
| ".sh": "bash", | |
| ".bash": "bash", | |
| ".zsh": "zsh", | |
| } | |
| return extension_map.get(extension.lower()) | |