@@ -18,6 +18,7 @@ def test_get_usage_info(client_v2):
18
18
"current_page_count_form" ,
19
19
"current_page_count_high_quality" ,
20
20
"current_page_count_native_text" ,
21
+ "current_page_count_excel" ,
21
22
"daily_quota" ,
22
23
"monthly_quota" ,
23
24
"overage_page_count" ,
@@ -44,7 +45,10 @@ def test_get_usage_info(client_v2):
44
45
def test_whisper_v2 (client_v2 , data_dir , output_mode , mode , input_file ):
45
46
file_path = os .path .join (data_dir , input_file )
46
47
whisper_result = client_v2 .whisper (
47
- mode = mode , output_mode = output_mode , file_path = file_path , wait_for_completion = True
48
+ mode = mode ,
49
+ output_mode = output_mode ,
50
+ file_path = file_path ,
51
+ wait_for_completion = True ,
48
52
)
49
53
logger .debug (f"Result for '{ output_mode } ', '{ mode } ', " f"'{ input_file } : { whisper_result } " )
50
54
@@ -54,24 +58,62 @@ def test_whisper_v2(client_v2, data_dir, output_mode, mode, input_file):
54
58
assert_extracted_text (exp_file , whisper_result , mode , output_mode )
55
59
56
60
61
@pytest.mark.parametrize(
    "output_mode, mode, input_file",
    [("layout_preserving", "high_quality", "test.json")],
)
def test_whisper_v2_error(client_v2, data_dir, output_mode, mode, input_file):
    """Check that whispering the parametrized input yields an error response.

    The input file is resolved relative to ``data_dir`` and submitted with
    ``wait_for_completion=True``; the returned value is then validated by
    ``assert_error_message``.
    """
    # NOTE(review): "test.json" is presumably an unsupported input type for
    # extraction -- confirm against the service's accepted formats.
    source_path = os.path.join(data_dir, input_file)

    response = client_v2.whisper(
        file_path=source_path,
        mode=mode,
        output_mode=output_mode,
        wait_for_completion=True,
    )
    logger.debug(f"Result for '{output_mode}', '{mode}', " f"'{input_file}: {response}")

    assert_error_message(response)
79
+
80
+
57
81
@pytest .mark .parametrize (
58
82
"output_mode, mode, url, input_file, page_count" ,
59
83
[
60
- ("layout_preserving" , "native_text" , "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf" ,
61
- "credit_card.pdf" , 7 ),
62
- ("layout_preserving" , "low_cost" , "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf" ,
63
- "credit_card.pdf" , 7 ),
64
- ("layout_preserving" , "high_quality" , "https://unstractpocstorage.blob.core.windows.net/public/scanned_bill.pdf" ,
65
- "restaurant_invoice_photo.pdf" , 1 ),
66
- ("layout_preserving" , "form" , "https://unstractpocstorage.blob.core.windows.net/public/scanned_form.pdf" ,
67
- "handwritten-form.pdf" , 1 ),
68
- ]
84
+ (
85
+ "layout_preserving" ,
86
+ "native_text" ,
87
+ "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf" ,
88
+ "credit_card.pdf" ,
89
+ 7 ,
90
+ ),
91
+ (
92
+ "layout_preserving" ,
93
+ "low_cost" ,
94
+ "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf" ,
95
+ "credit_card.pdf" ,
96
+ 7 ,
97
+ ),
98
+ (
99
+ "layout_preserving" ,
100
+ "high_quality" ,
101
+ "https://unstractpocstorage.blob.core.windows.net/public/scanned_bill.pdf" ,
102
+ "restaurant_invoice_photo.pdf" ,
103
+ 1 ,
104
+ ),
105
+ (
106
+ "layout_preserving" ,
107
+ "form" ,
108
+ "https://unstractpocstorage.blob.core.windows.net/public/scanned_form.pdf" ,
109
+ "handwritten-form.pdf" ,
110
+ 1 ,
111
+ ),
112
+ ],
69
113
)
70
114
def test_whisper_v2_url_in_post (client_v2 , data_dir , output_mode , mode , url , input_file , page_count ):
71
115
usage_before = client_v2 .get_usage_info ()
72
- whisper_result = client_v2 .whisper (
73
- mode = mode , output_mode = output_mode , url = url , wait_for_completion = True
74
- )
116
+ whisper_result = client_v2 .whisper (mode = mode , output_mode = output_mode , url = url , wait_for_completion = True )
75
117
logger .debug (f"Result for '{ output_mode } ', '{ mode } ', " f"'{ input_file } : { whisper_result } " )
76
118
77
119
exp_basename = f"{ Path (input_file ).stem } .{ mode } .{ output_mode } .txt"
@@ -83,6 +125,12 @@ def test_whisper_v2_url_in_post(client_v2, data_dir, output_mode, mode, url, inp
83
125
verify_usage (usage_before , usage_after , page_count , mode )
84
126
85
127
128
def assert_error_message(whisper_result):
    """Assert that ``whisper_result`` is an error-shaped response dict.

    The result must be a ``dict`` whose ``"status"`` entry equals ``"error"``
    and whose ``"message"`` entry contains the substring ``"error"``.
    A missing key raises ``KeyError`` rather than ``AssertionError``.
    """
    assert isinstance(whisper_result, dict)

    status = whisper_result["status"]
    assert status == "error"

    message = whisper_result["message"]
    assert "error" in message
132
+
133
+
86
134
def assert_extracted_text (file_path , whisper_result , mode , output_mode ):
87
135
with open (file_path , encoding = "utf-8" ) as f :
88
136
exp = f .read ()
@@ -91,34 +139,45 @@ def assert_extracted_text(file_path, whisper_result, mode, output_mode):
91
139
assert whisper_result ["status_code" ] == 200
92
140
93
141
# For OCR based processing
94
- threshold = 0.97
142
+ threshold = 0.94
95
143
96
144
# For text based processing
97
145
if mode == "native_text" and output_mode == "text" :
98
146
threshold = 0.99
147
+ elif mode == "low_cost" :
148
+ threshold = 0.90
99
149
extracted_text = whisper_result ["extraction" ]["result_text" ]
100
150
similarity = SequenceMatcher (None , extracted_text , exp ).ratio ()
101
151
102
152
if similarity < threshold :
103
153
diff = "\n " .join (
104
- unified_diff (exp .splitlines (), extracted_text .splitlines (), fromfile = "Expected" , tofile = "Extracted" )
154
+ unified_diff (
155
+ exp .splitlines (),
156
+ extracted_text .splitlines (),
157
+ fromfile = "Expected" ,
158
+ tofile = "Extracted" ,
159
+ )
105
160
)
106
- pytest .fail (f"Texts are not similar enough: { similarity * 100 :.2f} % similarity. Diff: \n { diff } " )
161
+ pytest .fail (f"Diff: \n { diff } . \n Texts are not similar enough: { similarity * 100 :.2f} % similarity. " )
107
162
108
163
109
def verify_usage(before_extract, after_extract, page_count, mode="form"):
    """Assert that usage counters changed by exactly ``page_count`` for ``mode``.

    Compares two usage snapshots (mappings of counter name to page count; in
    this file the "before" snapshot comes from ``client_v2.get_usage_info()``)
    and checks that:

    * ``today_page_count`` and ``current_page_count`` grew by ``page_count``;
    * ``overage_page_count`` grew by ``page_count``, but only when the "after"
      snapshot reports a positive overage;
    * ``current_page_count_<mode>`` grew by ``page_count``;
    * the per-mode counters of every other known mode are unchanged.

    Args:
        before_extract: Usage snapshot taken before the extraction.
        after_extract: Usage snapshot taken after the extraction.
        page_count: Number of pages the extraction is expected to consume.
        mode: Processing mode that was used for the extraction.

    Raises:
        ValueError: If ``mode`` is not one of the known modes.
        AssertionError: If any counter does not match the expectation.
        KeyError: If a required counter is missing from either snapshot.
    """
    all_modes = ("form", "high_quality", "low_cost", "native_text")
    if mode not in all_modes:
        # Same exception type that list.remove() raised before, with a clearer message.
        raise ValueError(f"Unknown mode {mode!r}; expected one of {all_modes}")

    assert (
        after_extract["today_page_count"] == before_extract["today_page_count"] + page_count
    ), "today_page_count calculation is wrong"
    assert (
        after_extract["current_page_count"] == before_extract["current_page_count"] + page_count
    ), "current_page_count calculation is wrong"

    # Overage is only checked once the "after" snapshot reports any overage.
    if after_extract["overage_page_count"] > 0:
        assert (
            after_extract["overage_page_count"] == before_extract["overage_page_count"] + page_count
        ), "overage_page_count calculation is wrong"

    mode_key = f"current_page_count_{mode}"
    assert after_extract[mode_key] == before_extract[mode_key] + page_count, f"{mode} mode calculation is wrong"

    # Counters of the modes that were not used must stay untouched.
    for other_mode in all_modes:
        if other_mode == mode:
            continue
        other_key = f"current_page_count_{other_mode}"
        assert after_extract[other_key] == before_extract[other_key], f"{other_mode} mode calculation is wrong"
0 commit comments