from transformers import pipeline
import os
import glob
import ast
from typing import Dict, Any

class CodeAnalyzer2:
    def __init__(self):
        # A single summarization model handles both docstring and code summaries
        self.summarizer = pipeline("summarization", model="Graverman/t5-code-summary")
        
    def detect_technologies(self, code_files: Dict[str, str]) -> Dict[str, Any]:
        """Detect technologies used in the project"""
        tech_stack = {
            "languages": set(),
            "frameworks": set(),
            "dependencies": set()
        }
        
        # Detect languages
        extensions_map = {
            '.py': 'Python',
            '.js': 'JavaScript',
            '.jsx': 'React/JavaScript',
            '.ts': 'TypeScript',
            '.tsx': 'React/TypeScript',
            '.java': 'Java'
        }
        
        for file_path in code_files.keys():
            ext = os.path.splitext(file_path)[1]
            if ext in extensions_map:
                tech_stack["languages"].add(extensions_map[ext])
        
        # Analyze Python dependencies
        for file_path, content in code_files.items():
            if file_path.endswith('.py'):
                try:
                    tree = ast.parse(content)
                    for node in ast.walk(tree):
                        if isinstance(node, ast.Import):
                            for name in node.names:
                                tech_stack["dependencies"].add(name.name.split('.')[0])
                        elif isinstance(node, ast.ImportFrom):
                            if node.module:
                                tech_stack["dependencies"].add(node.module.split('.')[0])
                except (SyntaxError, ValueError):
                    # Skip files that cannot be parsed as Python
                    continue
        
        # Check if common frameworks are used
        framework_indicators = {
            'django': 'Django',
            'flask': 'Flask',
            'fastapi': 'FastAPI',
            'react': 'React',
            'angular': 'Angular',
            'vue': 'Vue.js',
            'spring': 'Spring',
            'tensorflow': 'TensorFlow',
            'torch': 'PyTorch',
            'pandas': 'Pandas',
            'numpy': 'NumPy'
        }
        
        for dep in tech_stack["dependencies"]:
            if dep.lower() in framework_indicators:
                tech_stack["frameworks"].add(framework_indicators[dep.lower()])
        
        return {k: list(v) for k, v in tech_stack.items()}

    def analyze_code_complexity(self, code_files: Dict[str, str]) -> Dict[str, Any]:
        """Analyze code complexity metrics"""
        metrics = {
            "total_lines": 0,
            "code_lines": 0,
            "class_count": 0,
            "function_count": 0,
            "complexity_score": 0
        }
        
        for file_path, content in code_files.items():
            if file_path.endswith('.py'):
                try:
                    tree = ast.parse(content)
                    metrics["class_count"] += sum(1 for node in ast.walk(tree) if isinstance(node, ast.ClassDef))
                    metrics["function_count"] += sum(1 for node in ast.walk(tree) if isinstance(node, ast.FunctionDef))
                    
                    lines = content.split('\n')
                    metrics["total_lines"] += len(lines)
                    metrics["code_lines"] += sum(1 for line in lines if line.strip() and not line.strip().startswith('#'))
                    
                    # Simple complexity score based on nesting depth and branches
                    complexity = 0
                    for node in ast.walk(tree):
                        if isinstance(node, (ast.If, ast.For, ast.While, ast.Try)):
                            complexity += 1
                    metrics["complexity_score"] += complexity
                except (SyntaxError, ValueError):
                    # Skip files that cannot be parsed as Python
                    continue
        
        return metrics

    def identify_objective(self, code_files: Dict[str, str]) -> str:
        """Identify the main objective of the project"""
        # Combine all Python docstrings and comments
        all_docs = []
        for file_path, content in code_files.items():
            if file_path.endswith('.py'):
                try:
                    tree = ast.parse(content)
                    for node in ast.walk(tree):
                        if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.Module)):
                            if ast.get_docstring(node):
                                all_docs.append(ast.get_docstring(node))
                except (SyntaxError, ValueError):
                    # Skip files that cannot be parsed as Python
                    continue
        
        combined_docs = " ".join(all_docs)
        if combined_docs:
            return self.summarizer(combined_docs, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
        return "Unable to determine project objective from available documentation"

    def read_code_files(self, directory: str) -> Dict[str, str]:
        """Read all code files from the given directory"""
        code_files = {}
        extensions = ['.py', '.java', '.jsx', '.js', '.ts', '.tsx']
        
        for ext in extensions:
            for file_path in glob.glob(f"{directory}/**/*{ext}", recursive=True):
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        code_files[file_path] = f.read()
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
        
        return code_files

    def generate_summary(self, code: str, context: str = "") -> str:
        """Generate a summary for the given code with optional context"""
        if not code.strip():
            return "No code provided"
        
        # Truncate input if too long
        code = code[:4000]
        prompt = f"{context}\n{code}" if context else code
        
        summary = self.summarizer(prompt, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
        return summary

    def analyze_project(self, project_dir: str, questions_file: str) -> Dict[str, Any]:
        """Analyze project and answer questions"""
        # Read code files
        code_files = self.read_code_files(project_dir)
        
        if not code_files:
            return {
                "project_summary": "No code files found",
                "tech_stack": {},
                "metrics": {},
                "objective": "No code files to analyze",
                "answers": {}
            }
        
        # Perform various analyses
        tech_stack = self.detect_technologies(code_files)
        metrics = self.analyze_code_complexity(code_files)
        objective = self.identify_objective(code_files)
        
        # Generate overall summary
        combined_code = "\n\n".join(code_files.values())
        summary = self.generate_summary(combined_code)
        
        # Read questions
        with open(questions_file, 'r') as f:
            questions = [line.strip() for line in f.readlines() if line.strip()]
        
        # Generate targeted answers based on analysis results
        answers = {}
        for question in questions:
            question_lower = question.lower()
            if 'abstract' in question_lower:
                answers[question] = objective
            elif 'architecture' in question_lower:
                arch_summary = f"Project Architecture:\n- Languages: {', '.join(tech_stack['languages'])}\n"
                if tech_stack['frameworks']:
                    arch_summary += f"- Frameworks: {', '.join(tech_stack['frameworks'])}\n"
                arch_summary += f"- Components: {metrics['class_count']} classes, {metrics['function_count']} functions"
                answers[question] = arch_summary
            elif 'software' in question_lower and 'requirement' in question_lower:
                deps = tech_stack['dependencies']
                frameworks = tech_stack['frameworks']
                req_list = list(set(deps) | set(frameworks))
                answers[question] = f"Software Requirements:\n- Python environment\n- Dependencies: {', '.join(req_list)}"
            elif 'hardware' in question_lower and 'requirement' in question_lower:
                complexity = "Low" if metrics['complexity_score'] < 10 else "Medium" if metrics['complexity_score'] < 30 else "High"
                answers[question] = f"Hardware Requirements:\n- Complexity: {complexity}\n- Minimum RAM: {2 if complexity == 'Low' else 4 if complexity == 'Medium' else 8}GB\n- CPU: {1 if complexity == 'Low' else 2 if complexity == 'Medium' else 4}+ cores recommended"
            else:
                # For other questions, generate a contextual summary
                answers[question] = self.generate_summary(combined_code, f"Context: {question}")
        
        return {
            "project_summary": summary,
            "tech_stack": tech_stack,
            "metrics": metrics,
            "objective": objective,
            "answers": answers
        }

# if __name__ == "__main__":
#     analyzer = CodeAnalyzer2()
#     # Example usage
#     results = analyzer.analyze_project(
#         "./example_project",
#         "./questions.txt"
#     )
#     print("\nProject Objective:", results["objective"])
#     print("\nTechnology Stack:")
#     for category, items in results["tech_stack"].items():
#         print(f"- {category.title()}: {', '.join(items)}")

#     print("\nCode Metrics:")
#     for metric, value in results["metrics"].items():
#         print(f"- {metric.replace('_', ' ').title()}: {value}")

#     print("\nAnswers to Questions:")
#     for q, a in results["answers"].items():
#         print(f"\n{q}:\n{a}")