rdtest / app.py
pentarosarium's picture
Upload 2 files
3d82a40 verified
raw
history blame contribute delete
6.77 kB
import streamlit as st
import fitz # PyMuPDF for PDF processing
import pandas as pd
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Hugging Face access token, taken from the HF_API_TOKEN environment variable.
# The value is None when the variable is not set.
hf_token = os.environ.get("HF_API_TOKEN")
# Load the model (Meta-Llama 3.1 8B)
@st.cache_resource
def load_model():
    """Load the Meta-Llama 3.1 8B Instruct causal language model.

    The function is wrapped in ``st.cache_resource`` so that repeated calls
    reuse the already loaded object instead of loading the weights again.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    # `token` is the current keyword for the access token; the older
    # `use_auth_token` spelling is deprecated in transformers.
    return AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        token=hf_token,
    )


# Loaded once at import time; other parts of the script use this global.
model = load_model()
# Function to extract text from PDF
def extract_pdf_text(file):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        file: A binary file-like object whose ``read()`` returns the PDF bytes
            (the UI passes the object returned by ``st.file_uploader``).

    Returns:
        str: The text of all pages joined with no separator; an empty string
        when the document has no extractable text.
    """
    # Use the document as a context manager so it is closed even when text
    # extraction raises part-way through; the original never closed it.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return "".join(page.get_text("text") for page in doc)
# Function to chunk text into smaller sections
def chunk_text(text, max_tokens=1000):
    """Split *text* into chunks of at most roughly *max_tokens* words.

    The text is cut only after a ``.`` character, and every period is kept
    exactly where it was, so concatenating the pieces reproduces the input
    (apart from whitespace trimmed at chunk edges). "Tokens" are approximated
    by whitespace-separated words, not by the model tokenizer.

    Args:
        text: The text to split.
        max_tokens: Soft upper bound on the number of words per chunk. A single
            sentence longer than this is still emitted as one chunk.

    Returns:
        list[str]: Non-empty, stripped chunks in document order. Empty or
        whitespace-only input yields an empty list.
    """
    pieces = text.split(".")
    last_index = len(pieces) - 1

    chunks = []
    current_parts = []
    current_token_count = 0

    for index, piece in enumerate(pieces):
        # split() removed the delimiter; put it back on every piece except the
        # last one, which was not followed by a period in the input.
        sentence = piece + "." if index < last_index else piece
        if not sentence.strip():
            continue

        token_count = len(sentence.split())
        # Only flush when there is something to flush, so an oversized first
        # sentence does not produce an empty chunk.
        if current_parts and current_token_count + token_count > max_tokens:
            chunks.append("".join(current_parts).strip())
            current_parts = []
            current_token_count = 0

        current_parts.append(sentence)
        current_token_count += token_count

    if current_parts:
        chunks.append("".join(current_parts).strip())
    return chunks
# Prompt generation for extracting financial data
def generate_extraction_prompt(chunk):
    """Build the instruction prompt asking the model to extract IFRS metrics.

    Args:
        chunk: The piece of report text to embed after the ``Text:`` label.

    Returns:
        str: A prompt that starts and ends with a newline, lists the seven
        requested metrics one per line, and ends with the supplied text.
    """
    requested_metrics = (
        "- Revenue",
        "- Net Income",
        "- Total Assets",
        "- Total Liabilities",
        "- Shareholders' Equity",
        "- Current Assets",
        "- Current Liabilities",
    )
    prompt_lines = [
        "",  # leading newline
        "From the following text, please extract the following financial metrics in IFRS format:",
        *requested_metrics,
        "If the information is not found in the text, return 'Not Available'.",
        f"Text: {chunk}",
        "",  # trailing newline
    ]
    return "\n".join(prompt_lines)
# Function to query Meta-Llama for each chunk
@st.cache_resource
def _get_extraction_pipeline():
    """Build the text-generation pipeline once and reuse it for every chunk."""
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", token=hf_token
    )
    # load_model() is cached as well, so the weights loaded at startup are
    # shared instead of being read a second time.
    return pipeline("text-generation", model=load_model(), tokenizer=tokenizer)


def extract_financial_metrics_from_chunk(chunk):
    """Ask the model to extract the financial metrics found in one text chunk.

    The original implementation loaded the full model and tokenizer on every
    call, i.e. once per chunk; the pipeline is now created a single time.

    Args:
        chunk: A piece of report text, as produced by ``chunk_text``.

    Returns:
        str: The text generated by the model, without the prompt.
    """
    prompt = generate_extraction_prompt(chunk)
    generator = _get_extraction_pipeline()
    response = generator(
        prompt,
        # Without this the pipeline returns prompt + completion, so the parser
        # would also scan the instructions and the source text.
        return_full_text=False,
        # Bound the answer length explicitly instead of relying on defaults.
        max_new_tokens=256,
    )
    return response[0]['generated_text']
# Process the PDF text through the model
def process_pdf_text_for_metrics(text):
    """Run every chunk of *text* through the model and collect the answers.

    Args:
        text: The full text of the report.

    Returns:
        list[str]: One model response per chunk, in chunk order.
    """
    return [
        extract_financial_metrics_from_chunk(piece)
        for piece in chunk_text(text)
    ]
# Function to parse the metrics from the model response
import re

# Labels are tested in this order and the first one found in a line wins.
_METRIC_LABELS = (
    "Revenue",
    "Net Income",
    "Total Assets",
    "Total Liabilities",
    "Shareholders' Equity",
    "Current Assets",
    "Current Liabilities",
)

# A number is either a comma-grouped integer ("1,234,567") or a plain run of
# digits. The look-ahead stops "1,2345" from being read as "1,234" + "5".
_NUMBER_PATTERN = re.compile(r"\d{1,3}(?:,\d{3})+(?!\d)|\d+")


def parse_metrics(extracted_text):
    """Pull the numbers that follow each known metric label out of a response.

    Each line is matched against the metric labels; the digits found on that
    line are stored under the first label it contains. Thousands separators
    are removed so that "1,234,567" is one number rather than three. Decimal
    points and signs are not interpreted: "1.5" still yields "1" and "5".

    Args:
        extracted_text: Text returned by the model, one metric per line.

    Returns:
        dict[str, list[str]]: For every label seen, the digit strings found on
        its line (possibly an empty list). When a label occurs on several
        lines, the last line that contains numbers wins; a line without
        numbers never replaces an already parsed value.
    """
    metrics = {}
    for line in extracted_text.split("\n"):
        label = next((name for name in _METRIC_LABELS if name in line), None)
        if label is None:
            continue
        numbers = [
            match.replace(",", "") for match in _NUMBER_PATTERN.findall(line)
        ]
        # Keep an earlier value when this line has none, e.g. "Not Available".
        if numbers or label not in metrics:
            metrics[label] = numbers
    return metrics
# Function to aggregate metrics from all chunks
def aggregate_metrics(extracted_metrics):
    """Merge the metrics parsed from each model response into one mapping.

    Args:
        extracted_metrics: Iterable of model response strings, one per chunk.

    Returns:
        dict: One entry per known metric. A metric keeps the first non-empty
        value parsed for it and stays ``None`` when no response mentions it.
    """
    labels = (
        "Revenue",
        "Net Income",
        "Total Assets",
        "Total Liabilities",
        "Shareholders' Equity",
        "Current Assets",
        "Current Liabilities",
    )
    combined = dict.fromkeys(labels)

    for response_text in extracted_metrics:
        for label, numbers in parse_metrics(response_text).items():
            # An existing truthy value is never replaced by a later chunk.
            if combined[label]:
                continue
            combined[label] = numbers
    return combined
# Function to calculate financial ratios
def _first_int(metrics, key):
    """Return the first number recorded for *key*, converted to ``int``."""
    return int(metrics[key][0])


def calculate_financial_ratios(metrics):
    """Compute liquidity, leverage and profitability ratios from the metrics.

    Only the first number stored for each metric is used.

    Args:
        metrics: Mapping from metric name to a list of digit strings (or
            ``None`` when the metric was not found), as produced by
            ``aggregate_metrics``.

    Returns:
        dict[str, float]: The four ratios when every input is usable.
        str: An explanatory message when a metric is missing, is not a valid
        integer, or a denominator is zero.
    """
    try:
        current_assets = _first_int(metrics, 'Current Assets')
        current_liabilities = _first_int(metrics, 'Current Liabilities')
        total_liabilities = _first_int(metrics, 'Total Liabilities')
        equity = _first_int(metrics, 'Shareholders\' Equity')
        net_income = _first_int(metrics, 'Net Income')
        total_assets = _first_int(metrics, 'Total Assets')

        return {
            'Current Ratio': current_assets / current_liabilities,
            'Debt to Equity': total_liabilities / equity,
            'Return on Assets (ROA)': net_income / total_assets,
            'Return on Equity (ROE)': net_income / equity,
        }
    # ValueError (non-numeric text) and ZeroDivisionError (a reported value of
    # 0) used to escape and crash the app; report them like a missing metric.
    except (TypeError, KeyError, IndexError, ValueError, ZeroDivisionError):
        return "Some metrics were not extracted properly or are missing."
# Streamlit UI
@st.cache_resource
def _load_question_pipeline():
    """Build the text-generation pipeline used to answer questions, once."""
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", token=hf_token
    )
    # Wrap the model that was already loaded at startup; a bare causal LM
    # cannot be called with a string, a pipeline can.
    return pipeline("text-generation", model=model, tokenizer=tokenizer)


st.title("Financial Ratio Extractor from IFRS Reports")
st.write("""
Upload an IFRS financial report (PDF), and this app will automatically extract key financial metrics such as Revenue,
Net Income, Total Assets, and calculate important financial ratios like ROA, ROE, and Debt-to-Equity Ratio.
You can also ask questions about the financial data using Meta-Llama.
""")

# File uploader for PDF
uploaded_file = st.file_uploader("Upload your IFRS report (PDF)", type=["pdf"])

# If a PDF is uploaded
if uploaded_file:
    # Streamlit re-runs the whole script on every widget interaction. Keep the
    # result for the current file in the session so that typing a question or
    # pressing the button does not push the document through the model again.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("ratios_source_file") != file_key:
        with st.spinner("Processing your document, please wait..."):
            # Extract text from PDF
            pdf_text = extract_pdf_text(uploaded_file)

            # Process the text through Meta-Llama for metrics extraction
            extracted_metrics = process_pdf_text_for_metrics(pdf_text)

            # Aggregate extracted metrics
            aggregated_metrics = aggregate_metrics(extracted_metrics)

            # Calculate financial ratios
            st.session_state["financial_ratios"] = calculate_financial_ratios(
                aggregated_metrics
            )
            st.session_state["ratios_source_file"] = file_key

    financial_ratios = st.session_state["financial_ratios"]

    # Display extracted financial ratios
    st.subheader("Extracted Financial Ratios:")
    if isinstance(financial_ratios, dict):
        st.table(pd.DataFrame(financial_ratios.items(), columns=["Ratio", "Value"]))
    else:
        # A string here is the explanatory message for missing metrics.
        st.write(financial_ratios)

    # Asking questions to Meta-Llama
    # NOTE(review): the question is sent to the model on its own; the extracted
    # metrics are not included as context. Confirm whether that is intended.
    st.subheader("Ask Meta-Llama about the extracted financial data:")
    question = st.text_input("Enter your question here")

    if st.button("Ask Meta-Llama"):
        if question:
            response = _load_question_pipeline()(
                question,
                return_full_text=False,  # show the answer, not the echoed question
                max_new_tokens=256,
            )
            st.write("Meta-Llama's Response:")
            st.write(response[0]['generated_text'])