
Commit 415866c

Merge pull request #10 from aws-samples/idp-lending
(enhance): Update Mortgage with Amazon Textract Analyze Lending
2 parents 224c303 + 206e466 commit 415866c

25 files changed: +756 additions, −43922 deletions

industry/mortgage/01-document-classification.ipynb

Lines changed: 219 additions & 222 deletions
Large diffs are not rendered by default.

industry/mortgage/02-document-extraction-1.ipynb

Lines changed: 0 additions & 1640 deletions
This file was deleted.

industry/mortgage/02-document-extraction.ipynb

Lines changed: 502 additions & 0 deletions
Large diffs are not rendered by default.
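
The classification and extraction notebooks above were rewritten around the Amazon Textract Analyze Lending API referenced in the commit title, but their diffs are too large to render here. For orientation only, a minimal sketch of the asynchronous Analyze Lending flow; the bucket, key, and polling interval are illustrative assumptions, not values taken from the notebooks:

import time

import boto3

textract = boto3.client("textract")

# Analyze Lending is asynchronous: start a job against a lending package stored in S3.
# Bucket and key below are placeholders for illustration.
job = textract.start_lending_analysis(
    DocumentLocation={
        "S3Object": {"Bucket": "my-data-bucket", "Name": "idp-mortgage/lending_package.pdf"}
    }
)

# Poll until the job finishes, then read the per-page classification and extractions.
while True:
    result = textract.get_lending_analysis(JobId=job["JobId"])
    if result["JobStatus"] in ("SUCCEEDED", "FAILED", "PARTIAL_SUCCESS"):
        break
    time.sleep(5)

for page in result.get("Results", []):
    # Each result carries the detected page type plus any fields extracted for it
    print(page.get("PageClassification"), len(page.get("Extractions", [])))
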

industry/mortgage/04-document-enrichment.ipynb renamed to industry/mortgage/03-document-enrichment.ipynb

Lines changed: 29 additions & 31 deletions
@@ -27,7 +27,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from textractcaller.t_call import call_textract, Textract_Features, Textract_Types\n",
@@ -39,7 +41,8 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "scrolled": true
+    "scrolled": true,
+    "tags": []
    },
    "outputs": [],
    "source": [
@@ -93,7 +96,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "documentName = \"docs/Paystub.jpg\"\n",
+    "documentName = \"docs/Paystub.png\"\n",
     "display(Image(filename=documentName, width=500))"
    ]
   },
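
The next hunk reworks the extraction cell to read the paystub from local disk and pass raw bytes to call_textract instead of an S3 URI, then feeds the extracted text to Amazon Comprehend for PII detection. As a reference, a self-contained sketch of that pattern outside the notebook; the file path and the default AWS region are assumptions:

import boto3
from textractcaller.t_call import call_textract
from textractprettyprinter.t_pretty_print import Textract_Pretty_Print, get_string

document_name = "docs/Paystub.png"  # assumed local sample document

# Read the document from disk and pass raw bytes to Textract instead of an S3 URI
with open(document_name, "rb") as document:
    image_bytes = bytearray(document.read())

resp = call_textract(input_document=image_bytes)
text = get_string(textract_json=resp, output_type=[Textract_Pretty_Print.LINES])

# Detect PII entities in the extracted text with Amazon Comprehend
comprehend = boto3.client("comprehend")
pii = comprehend.detect_pii_entities(Text=text, LanguageCode="en")
for entity in pii["Entities"]:
    print(entity["Type"], round(entity["Score"], 3))
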
@@ -110,11 +113,15 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "scrolled": true
+    "scrolled": true,
+    "tags": []
    },
    "outputs": [],
    "source": [
-    "resp = call_textract(input_document = f's3://{data_bucket}/idp-mortgage/textract/Paystub.jpg')\n",
+    "with open(documentName, 'rb') as document:\n",
+    "    imageBytes = bytearray(document.read())\n",
+    "\n",
+    "resp = call_textract(input_document = imageBytes)\n",
     "text = get_string(textract_json=resp, output_type=[Textract_Pretty_Print.LINES])\n",
     "\n",
     "#Call Amazon Comprehend Detect PII Entities API\n",
@@ -148,21 +155,24 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "scrolled": true
+    "scrolled": true,
+    "tags": []
    },
    "outputs": [],
    "source": [
     "from textractoverlayer.t_overlay import DocumentDimensions, get_bounding_boxes\n",
     "\n",
-    "def redact_doc(s3document, localpath, redact_entities):\n",
-    "    print(s3document)\n",
+    "def redact_doc(localpath, redact_entities):\n",
     "    try:\n",
     "        img = PImage.open(localpath)\n",
     "        draw = ImageDraw.Draw(img)\n",
     "\n",
     "        # Use call_textract to get bounding boxes\n",
     "        # call_textract without the features parameter uses Textract Detect text\n",
-    "        resp = call_textract(input_document = s3document)\n",
+    "        with open(localpath, 'rb') as document:\n",
+    "            imageBytes = bytearray(document.read())\n",
+    "        \n",
+    "        resp = call_textract(input_document = imageBytes)\n",
     "        document_dimension:DocumentDimensions = DocumentDimensions(doc_width=img.size[0], doc_height=img.size[1])\n",
     "        overlay=[Textract_Types.LINE, Textract_Types.WORD, Textract_Types.FORM, Textract_Types.CELL, Textract_Types.KEY, Textract_Types.VALUE]\n",
     "        bounding_box_list = get_bounding_boxes(textract_json=resp, document_dimensions=[document_dimension], overlay_features=overlay)\n",
@@ -197,7 +207,7 @@
     "        \n",
     "        #Generate the redacted/enriched document file and save to file system\n",
     "        opfile = Path(localpath).stem\n",
-    "        opfile = f'{opfile}_redacted.jpg' \n",
+    "        opfile = f'{opfile}_redacted.png' \n",
     "        img.save(opfile) \n",
     "        print(f'Done.... Redacted file saved: {opfile}')\n",
     "        return opfile\n",
@@ -215,10 +225,12 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
-    "redact_doc(f's3://{data_bucket}/idp-mortgage/textract/Paystub.jpg','docs/Paystub.jpg',['NAME','SSN','DATE_TIME'])"
+    "redact_doc('docs/Paystub.png',['NAME','SSN','DATE_TIME'])"
    ]
   },
   {
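
Taken together, the hunks above refactor redact_doc to operate entirely on a local file: Textract is called with image bytes, bounding boxes come from textractoverlayer, the redacted output is saved as a PNG, and the function is invoked as redact_doc('docs/Paystub.png', ['NAME','SSN','DATE_TIME']). The step that matches Comprehend PII entities to Textract boxes is not visible in this diff, so the consolidated sketch below fills it in with assumptions; in particular the BoundingBox attributes (bbox.text, bbox.xmin, etc.) and the word-level matching are assumed, not taken from the commit:

from pathlib import Path

import boto3
from PIL import Image as PImage, ImageDraw
from textractcaller.t_call import call_textract, Textract_Types
from textractoverlayer.t_overlay import DocumentDimensions, get_bounding_boxes
from textractprettyprinter.t_pretty_print import Textract_Pretty_Print, get_string


def redact_doc(localpath, redact_entities):
    img = PImage.open(localpath)
    draw = ImageDraw.Draw(img)

    # call_textract without the features parameter uses Textract text detection
    with open(localpath, "rb") as document:
        image_bytes = bytearray(document.read())
    resp = call_textract(input_document=image_bytes)

    # Word-level bounding boxes scaled to the image dimensions
    dims = DocumentDimensions(doc_width=img.size[0], doc_height=img.size[1])
    boxes = get_bounding_boxes(textract_json=resp,
                               document_dimensions=[dims],
                               overlay_features=[Textract_Types.WORD])

    # Find PII values of the requested types in the extracted text (assumed approach)
    text = get_string(textract_json=resp, output_type=[Textract_Pretty_Print.LINES])
    comprehend = boto3.client("comprehend")
    entities = comprehend.detect_pii_entities(Text=text, LanguageCode="en")["Entities"]
    pii_values = {text[e["BeginOffset"]:e["EndOffset"]] for e in entities
                  if e["Type"] in redact_entities}

    # Black out every word whose text matches a detected PII value
    # (bbox.text / bbox.xmin etc. assume the textractoverlayer BoundingBox attributes)
    for bbox in boxes:
        if bbox.text in pii_values:
            draw.rectangle(xy=[bbox.xmin, bbox.ymin, bbox.xmax, bbox.ymax], fill="black")

    opfile = f"{Path(localpath).stem}_redacted.png"
    img.save(opfile)
    return opfile
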
@@ -232,11 +244,13 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
-    "bank_document_local='docs/Paystub.jpg'\n",
-    "redacted_file='Paystub_redacted.jpg'\n",
+    "bank_document_local='docs/Paystub.png'\n",
+    "redacted_file='Paystub_redacted.png'\n",
     "\n",
     "print(f'\\nUnredacted Document\\t\\t\\t\\t\\t\\t\\tRedacted Document \\n')\n",
     "\n",
@@ -267,22 +281,6 @@
     "\n",
     "In order to clean up the files uploaded into the S3 bucket, execute the following command. If you created this SageMaker Domain Studio environment manually then follow the SageMaker documentation to delete the Studio domain. If you created, the Studio Domain using a CloudFormation stack, delete the stack. If you are performing this lab as part of an instructor led workshop please follow instructions shared by the instructor."
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!aws s3 rm s3://{data_bucket}/idp-mortgage/ --recursive --exclude \"*\" --include \"textract/*\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
 "metadata": {
