openpdf-backend / tests /parser /test_parser.py
zinoubm's picture
initial commit
66340f1
import os
import re
from app.parser.parsers.pdf import PdfParser
from app.parser.parsers.txt import TxtParser
from app.parser.parsers.docx import DocxParser
from tests.parser.data import dummy_docx, dummy_pdf, dummy_txt
def standarize(text):
    """Return *text* with whitespace normalized for comparison.

    Every run of one or more whitespace characters is replaced by a single
    space, then leading and trailing whitespace is stripped from the result.
    """
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def test_pdf_parser():
    """Check that ``PdfParser.parse`` recovers the text put into a dummy PDF.

    ``dummy_pdf(text)`` returns a file path (presumably a temporary PDF
    holding *text* -- confirm against ``tests.parser.data``). The path is
    parsed, the file is removed, and the parsed text is compared with the
    input after whitespace normalization via ``standarize``.
    """
    # Adjacent string literals are concatenated with no separator, so every
    # fragment except the last ends with an explicit space to keep the words
    # at the line joins apart.
    text = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor "
        "incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud "
        "exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
    )
    temp_file_path = dummy_pdf(text)
    try:
        parsed_text = PdfParser.parse(temp_file_path)
    finally:
        # Delete the fixture file even when parsing raises, so a failing
        # parser does not leave files behind.
        os.remove(temp_file_path)

    assert standarize(parsed_text) == standarize(text)
def test_txt_parser():
    """Check that ``TxtParser.parse`` recovers the text put into a dummy file.

    ``dummy_txt(text)`` returns a file path (presumably a temporary text file
    holding *text* -- confirm against ``tests.parser.data``). The path is
    parsed, the file is removed, and the parsed text is compared with the
    input after whitespace normalization via ``standarize``.
    """
    # Adjacent string literals are concatenated with no separator, so every
    # fragment except the last ends with an explicit space to keep the words
    # at the line joins apart.
    text = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor "
        "incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud "
        "exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
    )
    temp_file_path = dummy_txt(text)
    try:
        parsed_text = TxtParser.parse(temp_file_path)
    finally:
        # Delete the fixture file even when parsing raises, so a failing
        # parser does not leave files behind.
        os.remove(temp_file_path)

    assert standarize(parsed_text) == standarize(text)
def test_docx_parser():
    """Check that ``DocxParser.parse`` recovers the text put into a dummy DOCX.

    ``dummy_docx(text)`` returns a file path (presumably a temporary .docx
    holding *text* -- confirm against ``tests.parser.data``). The path is
    parsed, the file is removed, and the parsed text is compared with the
    input after whitespace normalization via ``standarize``.
    """
    # Adjacent string literals are concatenated with no separator, so every
    # fragment except the last ends with an explicit space to keep the words
    # at the line joins apart.
    text = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor "
        "incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud "
        "exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
    )
    temp_file_path = dummy_docx(text)
    try:
        parsed_text = DocxParser.parse(temp_file_path)
    finally:
        # Delete the fixture file even when parsing raises, so a failing
        # parser does not leave files behind.
        os.remove(temp_file_path)

    assert standarize(parsed_text) == standarize(text)