27 | 27 | {
28 | 28 | "cell_type": "code",
29 | 29 | "execution_count": null,
30 |    | - "metadata": {},
   | 30 | + "metadata": {
   | 31 | + "tags": []
   | 32 | + },
31 | 33 | "outputs": [],
32 | 34 | "source": [
33 | 35 | "from textractcaller.t_call import call_textract, Textract_Features, Textract_Types\n",

39 | 41 | "cell_type": "code",
40 | 42 | "execution_count": null,
41 | 43 | "metadata": {
42 |    | - "scrolled": true
   | 44 | + "scrolled": true,
   | 45 | + "tags": []
43 | 46 | },
44 | 47 | "outputs": [],
45 | 48 | "source": [

93 |  96 | "metadata": {},
94 |  97 | "outputs": [],
95 |  98 | "source": [
96 |     | - "documentName = \"docs/Paystub.jpg\"\n",
   |  99 | + "documentName = \"docs/Paystub.png\"\n",
97 | 100 | "display(Image(filename=documentName, width=500))"
98 | 101 | ]
99 | 102 | },

110 | 113 | "cell_type": "code",
111 | 114 | "execution_count": null,
112 | 115 | "metadata": {
113 |     | - "scrolled": true
    | 116 | + "scrolled": true,
    | 117 | + "tags": []
114 | 118 | },
115 | 119 | "outputs": [],
116 | 120 | "source": [
117 |     | - "resp = call_textract(input_document = f's3://{data_bucket}/idp-mortgage/textract/Paystub.jpg')\n",
    | 121 | + "with open(documentName, 'rb') as document:\n",
    | 122 | + "    imageBytes = bytearray(document.read())\n",
    | 123 | + "\n",
    | 124 | + "resp = call_textract(input_document = imageBytes)\n",
118 | 125 | "text = get_string(textract_json=resp, output_type=[Textract_Pretty_Print.LINES])\n",
119 | 126 | "\n",
120 | 127 | "#Call Amazon Comprehend Detect PII Entities API\n",
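The hunk above swaps the S3 URI for bytes read from the local file, so Textract is called on the in-memory image before the extracted text is handed to Comprehend for PII detection. A minimal sketch of that flow, assuming the same local path and libraries as the notebook (the entity printout at the end is illustrative and not notebook code):

```python
import boto3
from textractcaller.t_call import call_textract
from textractprettyprinter.t_pretty_print import get_string, Textract_Pretty_Print

document_name = "docs/Paystub.png"  # local document path, as used earlier in the notebook

# Read the image into memory and let Textract detect text from the raw bytes
with open(document_name, "rb") as document:
    image_bytes = bytearray(document.read())
response = call_textract(input_document=image_bytes)

# Flatten the Textract JSON into plain lines of text
text = get_string(textract_json=response, output_type=[Textract_Pretty_Print.LINES])

# Ask Amazon Comprehend which PII entities appear in that text
comprehend = boto3.client("comprehend")
pii = comprehend.detect_pii_entities(Text=text, LanguageCode="en")
for entity in pii["Entities"]:
    print(entity["Type"], text[entity["BeginOffset"]:entity["EndOffset"]])
```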

148 | 155 | "cell_type": "code",
149 | 156 | "execution_count": null,
150 | 157 | "metadata": {
151 |     | - "scrolled": true
    | 158 | + "scrolled": true,
    | 159 | + "tags": []
152 | 160 | },
153 | 161 | "outputs": [],
154 | 162 | "source": [
155 | 163 | "from textractoverlayer.t_overlay import DocumentDimensions, get_bounding_boxes\n",
156 | 164 | "\n",
157 |     | - "def redact_doc(s3document, localpath, redact_entities):\n",
158 |     | - "    print(s3document)\n",
    | 165 | + "def redact_doc(localpath, redact_entities):\n",
159 | 166 | "    try:\n",
160 | 167 | "        img = PImage.open(localpath)\n",
161 | 168 | "        draw = ImageDraw.Draw(img)\n",
162 | 169 | "\n",
163 | 170 | "        # Use call_textract to get bounding boxes\n",
164 | 171 | "        # call_textract without the features parameter uses Textract Detect text\n",
165 |     | - "        resp = call_textract(input_document = s3document)\n",
    | 172 | + "        with open(localpath, 'rb') as document:\n",
    | 173 | + "            imageBytes = bytearray(document.read())\n",
    | 174 | + "\n",
    | 175 | + "        resp = call_textract(input_document = imageBytes)\n",
166 | 176 | "        document_dimension:DocumentDimensions = DocumentDimensions(doc_width=img.size[0], doc_height=img.size[1])\n",
167 | 177 | "        overlay=[Textract_Types.LINE, Textract_Types.WORD, Textract_Types.FORM, Textract_Types.CELL, Textract_Types.KEY, Textract_Types.VALUE]\n",
168 | 178 | "        bounding_box_list = get_bounding_boxes(textract_json=resp, document_dimensions=[document_dimension], overlay_features=overlay)\n",

197 | 207 | "\n",
198 | 208 | "        #Generate the redacted/enriched document file and save to file system\n",
199 | 209 | "        opfile = Path(localpath).stem\n",
200 |     | - "        opfile = f'{opfile}_redacted.jpg'\n",
    | 210 | + "        opfile = f'{opfile}_redacted.png'\n",
201 | 211 | "        img.save(opfile)\n",
202 | 212 | "        print(f'Done.... Redacted file saved: {opfile}')\n",
203 | 213 | "        return opfile\n",
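The redact_doc changes drop the S3 parameter entirely: bounding boxes now come from a Textract call on the local bytes, and the black rectangles are drawn straight onto the PIL image. The middle of the function is collapsed in this diff, so the snippet below is only a sketch of how the overlayer boxes are typically matched against Comprehend's PII strings and blacked out; the pii_strings values, the text-matching test, and the use of the BoundingBox text attribute are assumptions, not the notebook's hidden lines.

```python
from PIL import Image as PImage, ImageDraw
from textractcaller.t_call import call_textract, Textract_Types
from textractoverlayer.t_overlay import DocumentDimensions, get_bounding_boxes

local_path = "docs/Paystub.png"              # assumed local document path
pii_strings = {"Jane Doe", "123-45-6789"}    # hypothetical strings recovered from detect_pii_entities offsets

img = PImage.open(local_path)
draw = ImageDraw.Draw(img)

# Same pattern as the diff: Textract on local bytes; no features means plain text detection
with open(local_path, "rb") as document:
    resp = call_textract(input_document=bytearray(document.read()))

# Scale Textract's relative geometry to pixel coordinates for this image
dims = DocumentDimensions(doc_width=img.size[0], doc_height=img.size[1])
boxes = get_bounding_boxes(textract_json=resp,
                           document_dimensions=[dims],
                           overlay_features=[Textract_Types.WORD])

for box in boxes:
    # BoundingBox objects expose pixel coordinates; matching on box.text is an assumption
    if box.text in pii_strings:
        draw.rectangle(xy=[box.xmin, box.ymin, box.xmax, box.ymax], fill="black")

img.save("Paystub_redacted.png")
```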

215 | 225 | {
216 | 226 | "cell_type": "code",
217 | 227 | "execution_count": null,
218 |     | - "metadata": {},
    | 228 | + "metadata": {
    | 229 | + "tags": []
    | 230 | + },
219 | 231 | "outputs": [],
220 | 232 | "source": [
221 |     | - "redact_doc(f's3://{data_bucket}/idp-mortgage/textract/Paystub.jpg','docs/Paystub.jpg',['NAME','SSN','DATE_TIME'])"
    | 233 | + "redact_doc('docs/Paystub.png',['NAME','SSN','DATE_TIME'])"
222 | 234 | ]
223 | 235 | },
224 | 236 | {

232 | 244 | {
233 | 245 | "cell_type": "code",
234 | 246 | "execution_count": null,
235 |     | - "metadata": {},
    | 247 | + "metadata": {
    | 248 | + "tags": []
    | 249 | + },
236 | 250 | "outputs": [],
237 | 251 | "source": [
238 |     | - "bank_document_local='docs/Paystub.jpg'\n",
239 |     | - "redacted_file='Paystub_redacted.jpg'\n",
    | 252 | + "bank_document_local='docs/Paystub.png'\n",
    | 253 | + "redacted_file='Paystub_redacted.png'\n",
240 | 254 | "\n",
241 | 255 | "print(f'\\nUnredacted Document\\t\\t\\t\\t\\t\\t\\tRedacted Document \\n')\n",
242 | 256 | "\n",
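This cell prints the header line and then renders the unredacted and redacted files next to each other; the rendering code itself lies outside the hunk. One simple way to do such a comparison with PIL alone, reusing the two paths from the cell above (the canvas approach is an illustrative alternative, not the notebook's own display code):

```python
from IPython.display import display
from PIL import Image as PImage

bank_document_local = 'docs/Paystub.png'
redacted_file = 'Paystub_redacted.png'    # produced earlier by redact_doc

original = PImage.open(bank_document_local)
redacted = PImage.open(redacted_file)

# Paste both images onto one white canvas: original on the left, redacted on the right
canvas = PImage.new('RGB',
                    (original.width + redacted.width, max(original.height, redacted.height)),
                    'white')
canvas.paste(original, (0, 0))
canvas.paste(redacted, (original.width, 0))
display(canvas)
```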

267 | 281 | "\n",
268 | 282 | "In order to clean up the files uploaded into the S3 bucket, execute the following command. If you created this SageMaker Studio domain manually, follow the SageMaker documentation to delete the Studio domain. If you created the Studio domain using a CloudFormation stack, delete the stack. If you are performing this lab as part of an instructor-led workshop, please follow the instructions shared by the instructor."
269 | 283 | ]
270 |     | - },
271 |     | - {
272 |     | - "cell_type": "code",
273 |     | - "execution_count": null,
274 |     | - "metadata": {},
275 |     | - "outputs": [],
276 |     | - "source": [
277 |     | - "!aws s3 rm s3://{data_bucket}/idp-mortgage/ --recursive --exclude \"*\" --include \"textract/*\""
278 |     | - ]
279 |     | - },
280 |     | - {
281 |     | - "cell_type": "code",
282 |     | - "execution_count": null,
283 |     | - "metadata": {},
284 |     | - "outputs": [],
285 |     | - "source": []
286 | 284 | }
287 | 285 | ],
288 | 286 | "metadata": {