code-to-doc-streamlit / code_analyzer2.py
vijayvizag's picture
readme update
bcb80f2
raw
history blame
9.8 kB
from transformers import pipeline
import os
import glob
import ast
import re
from typing import List, Dict, Set, Any
import pkg_resources
import importlib.util
from collections import defaultdict
import huggingface_hub
class CodeAnalyzer2:
    """Inspect a project's source files and produce documentation-style answers.

    The analyzer reads source files from a directory, derives a technology
    stack and simple size/complexity metrics from them (Python files are
    parsed with ``ast``), and uses a Hugging Face summarization pipeline to
    turn docstrings or raw code into short natural-language summaries.
    """

    # File extension -> language label reported under "languages".
    _LANGUAGE_BY_EXTENSION = {
        '.py': 'Python',
        '.js': 'JavaScript',
        '.jsx': 'React/JavaScript',
        '.ts': 'TypeScript',
        '.tsx': 'React/TypeScript',
        '.java': 'Java',
    }

    # Lower-cased top-level module name -> framework label.
    # NOTE: dependencies are only collected from Python imports, so the
    # non-Python entries (react, angular, vue, spring) can only match when a
    # Python file imports a module with that name.
    _FRAMEWORK_BY_MODULE = {
        'django': 'Django',
        'flask': 'Flask',
        'fastapi': 'FastAPI',
        'react': 'React',
        'angular': 'Angular',
        'vue': 'Vue.js',
        'spring': 'Spring',
        'tensorflow': 'TensorFlow',
        'torch': 'PyTorch',
        'pandas': 'Pandas',
        'numpy': 'NumPy',
    }

    # Extensions picked up by read_code_files, in scan order.
    _SOURCE_EXTENSIONS = ('.py', '.java', '.jsx', '.js', '.ts', '.tsx')

    # Errors that mean "this file is not parseable Python"; such files are
    # skipped. Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
    _PARSE_ERRORS = (SyntaxError, ValueError, RecursionError)

    def __init__(self):
        # A single summarization model backs every summary this class produces.
        self.summarizer = pipeline("summarization", model="Graverman/t5-code-summary")

    @classmethod
    def _iter_python_trees(cls, code_files: Dict[str, str]):
        """Yield ``(content, tree)`` for every ``.py`` entry that parses.

        Files with another extension, and Python files that fail to parse,
        are silently skipped so one broken file does not abort the analysis.
        """
        for file_path, content in code_files.items():
            if not file_path.endswith('.py'):
                continue
            try:
                tree = ast.parse(content)
            except cls._PARSE_ERRORS:
                continue
            yield content, tree

    def detect_technologies(self, code_files: Dict[str, str]) -> Dict[str, Any]:
        """Detect technologies used in the project.

        Args:
            code_files: Mapping of file path to file content.

        Returns:
            A dict with the keys ``"languages"``, ``"frameworks"`` and
            ``"dependencies"``; each value is an alphabetically sorted list
            of unique strings, so repeated runs give identical output.
        """
        languages = set()
        for file_path in code_files:
            label = self._LANGUAGE_BY_EXTENSION.get(os.path.splitext(file_path)[1])
            if label:
                languages.add(label)

        # Collect the top-level package of every absolute import.
        dependencies = set()
        for _content, tree in self._iter_python_trees(code_files):
            for node in ast.walk(tree):
                if isinstance(node, ast.Import):
                    dependencies.update(alias.name.split('.')[0] for alias in node.names)
                elif isinstance(node, ast.ImportFrom):
                    # level > 0 is a relative import of the project's own
                    # modules, which is not an external dependency.
                    if node.module and node.level == 0:
                        dependencies.add(node.module.split('.')[0])

        frameworks = {
            self._FRAMEWORK_BY_MODULE[dep.lower()]
            for dep in dependencies
            if dep.lower() in self._FRAMEWORK_BY_MODULE
        }

        return {
            "languages": sorted(languages),
            "frameworks": sorted(frameworks),
            "dependencies": sorted(dependencies),
        }

    def analyze_code_complexity(self, code_files: Dict[str, str]) -> Dict[str, Any]:
        """Analyze code complexity metrics.

        Only parseable Python files contribute. ``complexity_score`` is the
        number of ``if``/``for``/``while``/``try`` statements found;
        ``code_lines`` counts lines that are neither blank nor start with
        ``#``. Both sync and ``async`` function definitions are counted.

        Args:
            code_files: Mapping of file path to file content.

        Returns:
            A dict with integer totals for ``total_lines``, ``code_lines``,
            ``class_count``, ``function_count`` and ``complexity_score``.
        """
        metrics = {
            "total_lines": 0,
            "code_lines": 0,
            "class_count": 0,
            "function_count": 0,
            "complexity_score": 0,
        }
        branching = (ast.If, ast.For, ast.While, ast.Try)
        functions = (ast.FunctionDef, ast.AsyncFunctionDef)

        for content, tree in self._iter_python_trees(code_files):
            # One pass over the tree feeds all three node-based counters.
            for node in ast.walk(tree):
                if isinstance(node, ast.ClassDef):
                    metrics["class_count"] += 1
                elif isinstance(node, functions):
                    metrics["function_count"] += 1
                elif isinstance(node, branching):
                    metrics["complexity_score"] += 1

            lines = content.split('\n')
            metrics["total_lines"] += len(lines)
            for line in lines:
                stripped = line.strip()
                if stripped and not stripped.startswith('#'):
                    metrics["code_lines"] += 1

        return metrics

    def identify_objective(self, code_files: Dict[str, str]) -> str:
        """Identify the main objective of the project.

        Gathers the docstrings of every module, class and function in the
        parseable Python files, joins them with spaces and summarizes the
        result with ``self.summarizer``.

        Args:
            code_files: Mapping of file path to file content.

        Returns:
            The generated summary, or a fixed fallback message when no
            docstring was found.
        """
        documented = (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef, ast.Module)
        all_docs = []
        for _content, tree in self._iter_python_trees(code_files):
            for node in ast.walk(tree):
                if isinstance(node, documented):
                    docstring = ast.get_docstring(node)
                    if docstring:
                        all_docs.append(docstring)

        combined_docs = " ".join(all_docs)
        if not combined_docs:
            return "Unable to determine project objective from available documentation"

        result = self.summarizer(combined_docs, max_length=100, min_length=30, do_sample=False)
        return result[0]['summary_text']

    def read_code_files(self, directory: str) -> Dict[str, str]:
        """Read all code files from the given directory.

        The directory is searched recursively for each supported extension.
        Files are decoded as UTF-8; a file that cannot be opened or decoded
        is reported on stdout and skipped.

        Args:
            directory: Root directory to scan.

        Returns:
            Mapping of matched file path (as returned by ``glob``) to the
            file's text content.
        """
        code_files = {}
        for ext in self._SOURCE_EXTENSIONS:
            for file_path in glob.glob(f"{directory}/**/*{ext}", recursive=True):
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        code_files[file_path] = f.read()
                except (OSError, UnicodeError) as e:
                    print(f"Error reading {file_path}: {e}")
        return code_files

    def generate_summary(self, code: str, context: str = "") -> str:
        """Generate a summary for the given code with optional context.

        Args:
            code: Source text to summarize; only the first 4000 characters
                are passed to the model.
            context: Optional text placed on its own line before the code.

        Returns:
            The model's summary, or ``"No code provided"`` when ``code`` is
            empty or whitespace only.
        """
        if not code.strip():
            return "No code provided"

        # Truncate the code (not the context) to keep the model input bounded.
        code = code[:4000]
        prompt = f"{context}\n{code}" if context else code
        result = self.summarizer(prompt, max_length=150, min_length=40, do_sample=False)
        return result[0]['summary_text']

    def analyze_project(self, project_dir: str, questions_file: str) -> Dict[str, Any]:
        """Analyze project and answer questions.

        Args:
            project_dir: Directory whose source files are analyzed.
            questions_file: Path to a UTF-8 text file with one question per
                line; blank lines are ignored.

        Returns:
            A dict with ``project_summary``, ``tech_stack``, ``metrics``,
            ``objective`` and ``answers`` (question -> answer). When no code
            files are found, placeholder values are returned and the
            questions file is not read.

        Raises:
            OSError: If the questions file cannot be opened.
        """
        code_files = self.read_code_files(project_dir)
        if not code_files:
            return {
                "project_summary": "No code files found",
                "tech_stack": {},
                "metrics": {},
                "objective": "No code files to analyze",
                "answers": {}
            }

        # Read the questions before the model-backed analyses so that a bad
        # path fails fast instead of after the expensive work.
        with open(questions_file, 'r', encoding='utf-8') as f:
            questions = [line.strip() for line in f if line.strip()]

        tech_stack = self.detect_technologies(code_files)
        metrics = self.analyze_code_complexity(code_files)
        objective = self.identify_objective(code_files)

        combined_code = "\n\n".join(code_files.values())
        summary = self.generate_summary(combined_code)

        # Answer each question from the analysis results, chosen by keyword;
        # anything unrecognized falls back to a question-conditioned summary.
        answers = {}
        for question in questions:
            question_lower = question.lower()
            if 'abstract' in question_lower:
                answers[question] = objective
            elif 'architecture' in question_lower:
                arch_summary = f"Project Architecture:\n- Languages: {', '.join(tech_stack['languages'])}\n"
                if tech_stack['frameworks']:
                    arch_summary += f"- Frameworks: {', '.join(tech_stack['frameworks'])}\n"
                arch_summary += f"- Components: {metrics['class_count']} classes, {metrics['function_count']} functions"
                answers[question] = arch_summary
            elif 'software' in question_lower and 'requirement' in question_lower:
                # Sorted so the generated text is stable between runs.
                req_list = sorted(set(tech_stack['dependencies']) | set(tech_stack['frameworks']))
                answers[question] = f"Software Requirements:\n- Python environment\n- Dependencies: {', '.join(req_list)}"
            elif 'hardware' in question_lower and 'requirement' in question_lower:
                score = metrics['complexity_score']
                if score < 10:
                    complexity, ram_gb, cores = "Low", 2, 1
                elif score < 30:
                    complexity, ram_gb, cores = "Medium", 4, 2
                else:
                    complexity, ram_gb, cores = "High", 8, 4
                answers[question] = f"Hardware Requirements:\n- Complexity: {complexity}\n- Minimum RAM: {ram_gb}GB\n- CPU: {cores}+ cores recommended"
            else:
                answers[question] = self.generate_summary(combined_code, f"Context: {question}")

        return {
            "project_summary": summary,
            "tech_stack": tech_stack,
            "metrics": metrics,
            "objective": objective,
            "answers": answers
        }
# if __name__ == "__main__":
# analyzer = CodeAnalyzer2()
# # Example usage
# results = analyzer.analyze_project(
# "./example_project",
# "./questions.txt"
# )
# print("\nProject Objective:", results["objective"])
# print("\nTechnology Stack:")
# for category, items in results["tech_stack"].items():
# print(f"- {category.title()}: {', '.join(items)}")
# print("\nCode Metrics:")
# for metric, value in results["metrics"].items():
# print(f"- {metric.replace('_', ' ').title()}: {value}")
# print("\nAnswers to Questions:")
# for q, a in results["answers"].items():
# print(f"\n{q}:\n{a}")