File size: 2,391 Bytes
d16e9aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
Tests for the PDF processor module.
"""
import pytest
from io import BytesIO
from src.pdf_processor import PDFProcessor

def test_clean_text():
    """Verify that ``clean_text`` normalizes spaces, newlines and NULs.

    Each case pairs a raw input with the exact string the cleaner is
    expected to return for it.
    """
    cleaner = PDFProcessor()

    cases = [
        # Runs of spaces collapse to a single space.
        ("This   has    extra   spaces", "This has extra spaces"),
        # CRLF line breaks come back as single spaces.
        ("Line1\r\nLine2\r\nLine3", "Line1 Line2 Line3"),
        # NUL characters are expected to come back as spaces, not vanish.
        ("Text with\x00null\x00chars", "Text with null chars"),
    ]
    for raw, expected in cases:
        assert cleaner.clean_text(raw) == expected

def test_create_chunks():
    """Verify the shape of what ``create_chunks`` returns.

    The result must be non-empty and every entry must be a
    ``(text, metadata)`` 2-tuple whose second element is a dict.
    """
    processor = PDFProcessor()

    sample = "This is a test. This is another test. And a final test."
    pieces = processor.create_chunks(sample, chunk_size=20, overlap=5)

    assert len(pieces) > 0
    for piece in pieces:
        assert isinstance(piece, tuple)
        assert len(piece) == 2  # (text, metadata)
        assert isinstance(piece[1], dict)  # metadata is dict

def test_chunk_metadata():
    """Verify the metadata attached to a single-chunk result.

    The input is expected to produce exactly one chunk, whose metadata
    must expose the position/size keys and report a ``chunk_size`` equal
    to the length of the chunk text.
    """
    processor = PDFProcessor()

    pieces = processor.create_chunks("Short test text.", chunk_size=20, overlap=5)

    assert len(pieces) == 1
    body, meta = pieces[0]

    # Every chunk is expected to carry these keys.
    for key in ("start_char", "end_char", "chunk_size"):
        assert key in meta
    assert meta["chunk_size"] == len(body)

def test_empty_text():
    """Verify that chunking an empty string produces no chunks."""
    processor = PDFProcessor()

    # len() is used deliberately: the result must be a sized, empty collection.
    assert len(processor.create_chunks("")) == 0

def test_chunk_overlap():
    """Verify that consecutive chunks share some trailing content.

    For each adjacent pair, at least one of the last three
    whitespace-separated words of the earlier chunk must occur (as a
    substring) in the text of the later chunk.
    """
    processor = PDFProcessor()

    sample = "This is a long text that should be split into multiple chunks with overlap."
    pieces = processor.create_chunks(sample, chunk_size=20, overlap=5)

    # Pair every chunk with its successor. When fewer than two chunks come
    # back there are no pairs, so nothing is asserted.
    for current, following in zip(pieces, pieces[1:]):
        tail_words = current[0].split()[-3:]
        following_text = following[0]

        # There should be some overlap between consecutive chunks
        assert any(word in following_text for word in tail_words)