File size: 1,734 Bytes
66340f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import re
from app.parser.parsers.pdf import PdfParser
from app.parser.parsers.txt import TxtParser
from app.parser.parsers.docx import DocxParser


from tests.parser.data import dummy_docx, dummy_pdf, dummy_txt


def standarize(text):
    """Return *text* with each whitespace run collapsed to one space.

    Leading and trailing whitespace is removed as well, so two strings that
    differ only in spacing or line breaks compare equal after this call.
    """
    whitespace_run = re.compile(r"\s+")
    collapsed = whitespace_run.sub(" ", text)
    return collapsed.strip()


def test_pdf_parser():
    """Check that PdfParser.parse recovers the text written into a PDF file.

    The file is produced by the ``dummy_pdf`` test helper, parsed, and then
    deleted. The comparison ignores whitespace differences by passing both
    sides through ``standarize``.
    """
    # NOTE: the adjacent literals are joined without a separating space
    # (e.g. "temporincididunt"). The same string serves as both the input and
    # the expected value, so it is kept exactly as is.
    text = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor"
        "incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud"
        "exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
    )

    temp_file_path = dummy_pdf(text)
    try:
        parsed_text = PdfParser.parse(temp_file_path)
    finally:
        # Delete the generated file even when parsing raises, so a broken
        # parser does not leave stray files behind.
        os.remove(temp_file_path)

    assert standarize(parsed_text) == standarize(text)


def test_txt_parser():
    """Check that TxtParser.parse recovers the text written into a text file.

    The file is produced by the ``dummy_txt`` test helper, parsed, and then
    deleted. The comparison ignores whitespace differences by passing both
    sides through ``standarize``.
    """
    # NOTE: the adjacent literals are joined without a separating space
    # (e.g. "temporincididunt"). The same string serves as both the input and
    # the expected value, so it is kept exactly as is.
    text = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor"
        "incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud"
        "exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
    )

    temp_file_path = dummy_txt(text)
    try:
        parsed_text = TxtParser.parse(temp_file_path)
    finally:
        # Delete the generated file even when parsing raises, so a broken
        # parser does not leave stray files behind.
        os.remove(temp_file_path)

    assert standarize(parsed_text) == standarize(text)


def test_docx_parser():
    """Check that DocxParser.parse recovers the text written into a DOCX file.

    The file is produced by the ``dummy_docx`` test helper, parsed, and then
    deleted. The comparison ignores whitespace differences by passing both
    sides through ``standarize``.
    """
    # NOTE: the adjacent literals are joined without a separating space
    # (e.g. "temporincididunt"). The same string serves as both the input and
    # the expected value, so it is kept exactly as is.
    text = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor"
        "incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud"
        "exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
    )

    temp_file_path = dummy_docx(text)
    try:
        parsed_text = DocxParser.parse(temp_file_path)
    finally:
        # Delete the generated file even when parsing raises, so a broken
        # parser does not leave stray files behind.
        os.remove(temp_file_path)

    assert standarize(parsed_text) == standarize(text)