# UshaKiranmai's picture
# Create app.py
# 57682dc verified
# raw
# history blame contribute delete
# 3.59 kB
import csv
import html
import io
import json

import gradio as gr
import pandas as pd
import torch
from fpdf import FPDF
from transformers import pipeline
# Build the summarization pipeline once at module load; `summary` reuses this
# single instance for every chunk it processes.
text_summary = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    torch_dtype=torch.float32,
)
def chunk_text(input_text, max_chunk_size=1024):
    """
    Split `input_text` into whitespace-delimited chunks of limited length.

    Words are packed greedily: a word joins the current chunk as long as the
    chunk (words joined by single spaces) stays within `max_chunk_size`
    characters; otherwise a new chunk is started. Words are never split, so a
    single word longer than `max_chunk_size` becomes a chunk of its own.

    Args:
        input_text: Text to split. Runs of whitespace are collapsed because
            the text is tokenized with `str.split()`.
        max_chunk_size: Maximum chunk length in characters.

    Returns:
        A list of non-empty chunk strings; an empty list when `input_text`
        contains no words.
    """
    chunks = []
    current_chunk = []
    # Running value of len(" ".join(current_chunk)); avoids re-joining the
    # whole chunk for every word.
    current_length = 0
    for word in input_text.split():
        # A joining space is only needed when the chunk already has a word.
        separator = 1 if current_chunk else 0
        candidate_length = current_length + separator + len(word)
        if candidate_length <= max_chunk_size:
            current_chunk.append(word)
            current_length = candidate_length
            continue
        # Flush only a non-empty chunk: an oversized word arriving while the
        # chunk is empty must not emit an empty string.
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        current_chunk = [word]
        current_length = len(word)
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def summary(input_text, max_length=130, min_length=30, output_format="Plain Text"):
    """
    Summarize `input_text` and render the result in the requested format.

    The text is split with `chunk_text`, every chunk is summarized on its own
    by the module-level `text_summary` pipeline, and the per-chunk summaries
    are joined with single spaces.

    Args:
        input_text: Text to summarize. `None` is treated as empty text.
        max_length: `max_length` passed to the pipeline for each chunk.
        min_length: `min_length` passed to the pipeline for each chunk; it is
            lowered to `max_length` if it exceeds it.
        output_format: One of "Plain Text", "JSON", "HTML", "CSV",
            "Markdown", "PDF" or "Excel".

    Returns:
        A string. For "Plain Text", "JSON", "HTML", "CSV" and "Markdown" it is
        the rendered summary. For "PDF" and "Excel" the summary is written to
        `summary.pdf` / `summary.xlsx` in the working directory and a status
        message naming that file is returned.

    Raises:
        ValueError: If `output_format` is not one of the supported formats.
    """
    supported_formats = ("Plain Text", "JSON", "HTML", "CSV", "Markdown", "PDF", "Excel")
    # Reject unknown formats before running the model instead of silently
    # returning None after doing all the work.
    if output_format not in supported_formats:
        raise ValueError(
            f"Unsupported output format: {output_format!r}. "
            f"Expected one of: {', '.join(supported_formats)}"
        )

    input_text = input_text or ""
    # Slider widgets may deliver floats; the length limits are token counts.
    max_length = int(max_length)
    # The UI lets min_length go up to 100 while max_length can be as low as
    # 30, so clamp to keep the request self-consistent.
    min_length = min(int(min_length), max_length)

    chunks = chunk_text(input_text)
    summarized_chunks = [
        text_summary(chunk, max_length=max_length, min_length=min_length)[0]["summary_text"]
        for chunk in chunks
    ]
    summary_text = " ".join(summarized_chunks)

    if output_format == "Plain Text":
        return summary_text

    if output_format == "JSON":
        result = {
            "summary": summary_text,
            "chunk_count": len(chunks),
            "original_length": len(input_text.split()),
            "summary_length": len(summary_text.split()),
        }
        return json.dumps(result, indent=4)

    if output_format == "HTML":
        # Escape the text so markup characters in the summary cannot break
        # (or inject into) the generated document.
        safe_text = html.escape(summary_text)
        return f"<html><body><h2>Summary</h2><p>{safe_text}</p></body></html>"

    if output_format == "CSV":
        # Let the csv module do the quoting so embedded quotes, commas and
        # newlines in the text still yield a parseable file.
        buffer = io.StringIO()
        writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")
        writer.writerow(["Original Text", "Summary"])
        writer.writerows(zip(chunks, summarized_chunks))
        return buffer.getvalue()

    if output_format == "Markdown":
        return f"## Summary\n\n{summary_text}"

    if output_format == "PDF":
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        # The core Arial font is limited to Latin-1; replace anything outside
        # it (e.g. curly quotes) rather than letting FPDF raise.
        printable_text = summary_text.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, printable_text)
        # NOTE(review): fixed file name in the working directory; concurrent
        # requests overwrite each other's file.
        pdf_output = "summary.pdf"
        pdf.output(pdf_output)
        return f"PDF generated: {pdf_output}"

    # Only "Excel" remains after the validation above.
    df = pd.DataFrame(
        {
            "Original Text": chunks,
            "Summary": summarized_chunks,
        }
    )
    excel_output = "summary.xlsx"
    df.to_excel(excel_output, index=False)
    return f"Excel file generated: {excel_output}"
# Expose `summary` through a Gradio form: the four inputs map positionally to
# its parameters (text, max length, min length, output format) and the
# returned string is displayed in a single text box.
iface = gr.Interface(
    title="Text Summarization with Advanced Output Formats",
    fn=summary,
    inputs=[
        gr.Textbox(lines=10, label="Input Text"),
        gr.Slider(
            minimum=30,
            maximum=300,
            value=130,
            step=10,
            label="Max Length",
        ),
        gr.Slider(
            minimum=20,
            maximum=100,
            value=30,
            step=10,
            label="Min Length",
        ),
        gr.Dropdown(
            choices=[
                "Plain Text",
                "JSON",
                "HTML",
                "CSV",
                "Markdown",
                "PDF",
                "Excel",
            ],
            value="Plain Text",
            label="Output Format",
        ),
    ],
    outputs=gr.Textbox(label="Summarized Output"),
)

# Start the web server for the interface.
iface.launch()