File size: 5,220 Bytes
05b45a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
"""Tests for text normalization service"""

import pytest

from api.src.services.text_processing.normalizer import normalize_text
from api.src.structures.schemas import NormalizationOptions


def test_url_protocols():
    """Check that http/https URLs are read out with the scheme kept as a word."""
    cases = (
        ("Check out https://example.com", "Check out https example dot com"),
        ("Visit http://site.com", "Visit http site dot com"),
        ("Go to https://test.org/path", "Go to https test dot org slash path"),
    )
    for raw, spoken in cases:
        # Build fresh options for every call so cases stay independent.
        result = normalize_text(raw, normalization_options=NormalizationOptions())
        assert result == spoken


def test_url_www():
    """Check that URLs starting with ``www.`` are read out part by part."""
    expectations = {
        "Go to www.example.com": "Go to www example dot com",
        "Visit www.test.org/docs": "Visit www test dot org slash docs",
        "Check www.site.com?q=test": (
            "Check www site dot com question-mark q equals test"
        ),
    }
    # Dicts keep insertion order, so the cases run in the order listed above.
    for source, expected in expectations.items():
        options = NormalizationOptions()
        assert normalize_text(source, normalization_options=options) == expected


def test_url_localhost():
    """Check how ``localhost:<port>`` URLs, with optional path/query, are read."""
    cases = (
        ("Running on localhost:7860", "Running on localhost colon 78 60"),
        (
            "Server at localhost:8080/api",
            "Server at localhost colon 80 80 slash api",
        ),
        (
            "Test localhost:3000/test?v=1",
            "Test localhost colon 3000 slash test question-mark v equals 1",
        ),
    )
    for raw, spoken in cases:
        normalized = normalize_text(
            raw, normalization_options=NormalizationOptions()
        )
        assert normalized == spoken


def test_url_ip_addresses():
    """Check that dotted IP addresses, with optional port/path, are spelled out."""
    inputs = [
        "Access 0.0.0.0:9090/test",
        "API at 192.168.1.1:8000",
        "Server 127.0.0.1",
    ]
    outputs = [
        "Access 0 dot 0 dot 0 dot 0 colon 90 90 slash test",
        "API at 192 dot 168 dot 1 dot 1 colon 8000",
        "Server 127 dot 0 dot 0 dot 1",
    ]
    # The two lists are parallel: inputs[i] must normalize to outputs[i].
    for text, expected in zip(inputs, outputs):
        assert (
            normalize_text(text, normalization_options=NormalizationOptions())
            == expected
        )


def test_url_raw_domains():
    """Check that bare domains (no scheme, no ``www``) are read out as URLs."""
    cases = (
        ("Visit google.com/search", "Visit google dot com slash search"),
        (
            "Go to example.com/path?q=test",
            "Go to example dot com slash path question-mark q equals test",
        ),
        ("Check docs.test.com", "Check docs dot test dot com"),
    )
    for raw, spoken in cases:
        opts = NormalizationOptions()
        got = normalize_text(raw, normalization_options=opts)
        assert got == spoken


def test_url_email_addresses():
    """Test email address handling.

    Each input contains a real ``local@domain`` address. The expected output
    reads ``@`` as "at" and ``.`` as "dot", in both the local part and the
    domain.
    """
    # NOTE: the inputs must contain literal addresses. A "[email protected]"
    # placeholder (left behind by web email obfuscation) has no "@" and can
    # never produce the expected spoken form.
    assert (
        normalize_text(
            "Email me at user@example.com",
            normalization_options=NormalizationOptions(),
        )
        == "Email me at user at example dot com"
    )
    assert (
        normalize_text(
            "Contact admin@test.org", normalization_options=NormalizationOptions()
        )
        == "Contact admin at test dot org"
    )
    # A dot inside the local part is spoken as "dot" as well.
    assert (
        normalize_text(
            "Send to test.user@site.com",
            normalization_options=NormalizationOptions(),
        )
        == "Send to test dot user at site dot com"
    )


def test_money():
    """Check that dollar amounts are converted to spoken words.

    Covers an amount followed by a magnitude word, a negative amount, and a
    decimal amount read as dollars and cents.
    """
    cases = (
        ("He lost $5.3 thousand.", "He lost five point three thousand dollars."),
        (
            "To put it weirdly -$6.9 million",
            "To put it weirdly minus six point nine million dollars",
        ),
        ("It costs $50.3.", "It costs fifty dollars and thirty cents."),
    )
    for raw, spoken in cases:
        result = normalize_text(raw, normalization_options=NormalizationOptions())
        assert result == spoken


def test_non_url_text():
    """Check normalization of text that contains no URL.

    A dotted non-URL token is expected to come out hyphenated, an ordinary
    sentence is expected to pass through unchanged, and a whole-dollar amount
    is expected to be read out in words.
    """
    expectations = {
        "This is not.a.url text": "This is not-a-url text",
        "Hello, how are you today?": "Hello, how are you today?",
        "It costs $50.": "It costs fifty dollars.",
    }
    # Dicts keep insertion order, so the cases run in the order listed above.
    for source, expected in expectations.items():
        normalized = normalize_text(
            source, normalization_options=NormalizationOptions()
        )
        assert normalized == expected