# Streamlit app: extract key IFRS financial metrics from an uploaded PDF
# report with Meta-Llama and compute standard financial ratios.
import os
import re
from functools import lru_cache

import fitz  # PyMuPDF for PDF processing
import pandas as pd
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Hugging Face access token for downloading the gated Llama model
# (None when the HF_API_TOKEN environment variable is unset).
hf_token = os.environ.get("HF_API_TOKEN")
# Load the model (Meta-Llama 3.1 8B) once per server process.
@st.cache_resource
def load_model():
    """Return a cached text-generation pipeline for Meta-Llama 3.1 8B Instruct.

    The UI below calls ``model(question)`` and indexes
    ``response[0]['generated_text']`` — the output shape of a transformers
    text-generation pipeline — so a pipeline (not a bare model) is returned.
    ``st.cache_resource`` prevents Streamlit from reloading the 8B weights
    on every script rerun. ``token=`` replaces the deprecated
    ``use_auth_token=`` argument.
    """
    llm = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", token=hf_token
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", token=hf_token
    )
    return pipeline("text-generation", model=llm, tokenizer=tokenizer)

model = load_model()
# Function to extract text from PDF
def extract_pdf_text(file):
    """Return the plain text of every page of an uploaded PDF.

    ``file`` is a file-like object (e.g. a Streamlit ``UploadedFile``); its
    entire contents are read into memory and parsed by PyMuPDF.
    """
    # Context manager closes the document handle even on parse errors
    # (the original leaked it); join avoids quadratic string appends.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return "".join(page.get_text("text") for page in doc)
# Function to chunk text into smaller sections
def chunk_text(text, max_tokens=1000):
    """Split *text* into sentence-aligned chunks of at most ~max_tokens words.

    Sentences are delimited naively by '.'; a "token" here is a
    whitespace-separated word. Returns a list of non-empty chunk strings,
    each ending with the periods of the sentences it contains.
    """
    sentences = text.split('.')
    chunks = []
    current_chunk = ""
    current_token_count = 0
    for sentence in sentences:
        # Skip empty/whitespace fragments (e.g. after a trailing '.'),
        # which previously injected stray '.' characters.
        if not sentence.strip():
            continue
        token_count = len(sentence.split())
        if current_chunk and current_token_count + token_count > max_tokens:
            chunks.append(current_chunk.strip())
            # Keep the sentence's period (the original dropped it for the
            # first sentence of every chunk after the first).
            current_chunk = sentence + "."
            current_token_count = token_count
        else:
            current_chunk += sentence + "."
            current_token_count += token_count
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
# Prompt template for extracting financial data; {chunk} is filled in
# per call by generate_extraction_prompt.
_EXTRACTION_PROMPT_TEMPLATE = """
From the following text, please extract the following financial metrics in IFRS format:
- Revenue
- Net Income
- Total Assets
- Total Liabilities
- Shareholders' Equity
- Current Assets
- Current Liabilities
If the information is not found in the text, return 'Not Available'.
Text: {chunk}
"""


def generate_extraction_prompt(chunk):
    """Return the metric-extraction prompt with *chunk* substituted in."""
    return _EXTRACTION_PROMPT_TEMPLATE.format(chunk=chunk)
@lru_cache(maxsize=1)
def _get_generation_pipeline():
    """Build the text-generation pipeline exactly once.

    The original reloaded the 8B model AND tokenizer from disk for every
    chunk, which is prohibitively slow; lru_cache memoizes the pipeline
    for the life of the process. ``token=`` replaces the deprecated
    ``use_auth_token=`` argument.
    """
    llm = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", token=hf_token
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", token=hf_token
    )
    return pipeline("text-generation", model=llm, tokenizer=tokenizer)


# Function to query Meta-Llama for each chunk
def extract_financial_metrics_from_chunk(chunk):
    """Ask the model to extract IFRS metrics from one chunk; return its raw text."""
    prompt = generate_extraction_prompt(chunk)
    nlp = _get_generation_pipeline()
    response = nlp(prompt)
    return response[0]['generated_text']
# Process the PDF text through the model
def process_pdf_text_for_metrics(text):
    """Chunk *text* and collect the model's raw response for each chunk."""
    return [extract_financial_metrics_from_chunk(chunk) for chunk in chunk_text(text)]
# Metric names in priority order — mirrors the original elif chain, so the
# first matching name claims the line.
_METRIC_NAMES = (
    "Revenue",
    "Net Income",
    "Total Assets",
    "Total Liabilities",
    "Shareholders' Equity",
    "Current Assets",
    "Current Liabilities",
)


# Function to parse the metrics from the model response
def parse_metrics(extracted_text):
    """Parse metric values out of one model response.

    Returns a dict mapping metric name -> list of numeric strings taken
    from lines that mention that name. ``\\d[\\d,]*`` keeps comma-grouped
    digits together (the original ``\\d+`` split '1,234' into ['1', '234'],
    so downstream code read the wrong value); commas are stripped so the
    results convert cleanly with int()/float().
    """
    metrics = {}
    for line in extracted_text.split("\n"):
        for name in _METRIC_NAMES:
            if name in line:
                metrics[name] = [
                    m.replace(",", "") for m in re.findall(r"\d[\d,]*", line)
                ]
                break  # one metric per line, like the original elif ladder
    return metrics
# Function to aggregate metrics from all chunks
def aggregate_metrics(extracted_metrics):
    """Merge per-chunk responses; the first non-empty value per metric wins."""
    metric_names = (
        "Revenue",
        "Net Income",
        "Total Assets",
        "Total Liabilities",
        "Shareholders' Equity",
        "Current Assets",
        "Current Liabilities",
    )
    aggregated = dict.fromkeys(metric_names)  # every metric starts as None
    for raw_response in extracted_metrics:
        for name, values in parse_metrics(raw_response).items():
            # Only fill metrics that are still missing/empty — later
            # chunks never overwrite an earlier hit.
            if not aggregated[name]:
                aggregated[name] = values
    return aggregated
# Function to calculate financial ratios
def calculate_financial_ratios(metrics):
    """Compute liquidity, leverage, and profitability ratios.

    *metrics* maps metric name -> list of numeric strings (first element
    used). Returns a dict of ratios, or an explanatory string when any
    required metric is missing (None/KeyError/IndexError), non-numeric
    (ValueError), or a denominator is zero (ZeroDivisionError) — the
    original let ValueError and ZeroDivisionError crash the app, and its
    int() rejected decimal strings; float() accepts both forms.
    """
    try:
        current_assets = float(metrics['Current Assets'][0])
        current_liabilities = float(metrics['Current Liabilities'][0])
        total_assets = float(metrics['Total Assets'][0])
        total_liabilities = float(metrics['Total Liabilities'][0])
        equity = float(metrics["Shareholders' Equity"][0])
        net_income = float(metrics['Net Income'][0])
        return {
            'Current Ratio': current_assets / current_liabilities,
            'Debt to Equity': total_liabilities / equity,
            'Return on Assets (ROA)': net_income / total_assets,
            'Return on Equity (ROE)': net_income / equity
        }
    except (TypeError, KeyError, IndexError, ValueError, ZeroDivisionError):
        return "Some metrics were not extracted properly or are missing."
# ---------- Streamlit UI ----------
st.title("Financial Ratio Extractor from IFRS Reports")

st.write("""
Upload an IFRS financial report (PDF), and this app will automatically extract key financial metrics such as Revenue,
Net Income, Total Assets, and calculate important financial ratios like ROA, ROE, and Debt-to-Equity Ratio.
You can also ask questions about the financial data using Meta-Llama.
""")

# PDF upload widget; only .pdf files are accepted.
uploaded_file = st.file_uploader("Upload your IFRS report (PDF)", type=["pdf"])
# If a PDF is uploaded, run the full extraction pipeline:
# PDF -> raw text -> per-chunk LLM responses -> merged metrics -> ratios.
if uploaded_file:
    st.write("Processing your document, please wait...")

    pdf_text = extract_pdf_text(uploaded_file)
    extracted_metrics = process_pdf_text_for_metrics(pdf_text)
    aggregated_metrics = aggregate_metrics(extracted_metrics)
    financial_ratios = calculate_financial_ratios(aggregated_metrics)

    st.subheader("Extracted Financial Ratios:")
    if isinstance(financial_ratios, dict):
        # Extraction succeeded — render the ratios as a two-column table.
        st.table(pd.DataFrame(financial_ratios.items(), columns=["Ratio", "Value"]))
    else:
        # calculate_financial_ratios returned an error-message string.
        st.write(financial_ratios)
# Free-form Q&A against the loaded model.
st.subheader("Ask Meta-Llama about the extracted financial data:")
question = st.text_input("Enter your question here")

if st.button("Ask Meta-Llama"):
    if question:
        # NOTE(review): response[0]['generated_text'] is the output shape of a
        # transformers text-generation *pipeline* — `model` must be a
        # pipeline-like callable; verify what load_model actually returns.
        response = model(question)
        st.write("Meta-Llama's Response:")
        st.write(response[0]['generated_text'])