@@ -1350,7 +1350,8 @@ def _list_attachments(self) -> List[str]:
13501350 catalog = self .root_object
13511351 # From the catalog get the embedded file names
13521352 try :
1353- filenames = cast (
1353+ # This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
1354+ names = cast (
13541355 ArrayObject ,
13551356 cast (
13561357 DictionaryObject ,
@@ -1359,8 +1360,23 @@ def _list_attachments(self) -> List[str]:
13591360 )
13601361 except KeyError :
13611362 return []
1362- attachments_names = [f for f in filenames if isinstance (f , str )]
1363- return attachments_names
1363+ attachment_names : List [str ] = []
1364+ for i , name in enumerate (names ):
1365+ if isinstance (name , str ):
1366+ attachment_names .append (name )
1367+ else :
1368+ name = name .get_object ()
1369+ for key in ["/UF" , "/F" ]:
1370+ # PDF 2.0 reference, table 43:
1371+ # > A PDF reader shall use the value of the UF key, when present, instead of the F key.
1372+ if key in name :
1373+ name = name [key ].get_object ()
1374+ if name == names [i - 1 ]:
1375+ # Avoid duplicates for the same entry.
1376+ continue
1377+ attachment_names .append (name )
1378+ break
1379+ return attachment_names
13641380
13651381 def _get_attachment_list (self , name : str ) -> List [bytes ]:
13661382 out = self ._get_attachments (name )[name ]
@@ -1389,7 +1405,8 @@ def _get_attachments(
13891405 catalog = self .root_object
13901406 # From the catalog get the embedded file names
13911407 try :
1392- filenames = cast (
1408+ # This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
1409+ names = cast (
13931410 ArrayObject ,
13941411 cast (
13951412 DictionaryObject ,
@@ -1399,21 +1416,36 @@ def _get_attachments(
13991416 except KeyError :
14001417 return {}
14011418 attachments : Dict [str , Union [bytes , List [bytes ]]] = {}
1419+
14021420 # Loop through attachments
1403- for i in range (len (filenames )):
1404- f = filenames [i ]
1405- if isinstance (f , str ):
1406- if filename is not None and f != filename :
1407- continue
1408- name = f
1409- f_dict = filenames [i + 1 ].get_object ()
1410- f_data = f_dict ["/EF" ]["/F" ].get_data ()
1411- if name in attachments :
1412- if not isinstance (attachments [name ], list ):
1413- attachments [name ] = [attachments [name ]] # type:ignore
1414- attachments [name ].append (f_data ) # type:ignore
1421+ for i , name in enumerate (names ):
1422+ if isinstance (name , str ):
1423+ # Retrieve the corresponding reference.
1424+ file_dictionary = names [i + 1 ].get_object ()
1425+ else :
1426+ # We have the reference, but need to determine the name.
1427+ file_dictionary = name .get_object ()
1428+ for key in ["/UF" , "/F" ]:
1429+ # PDF 2.0 reference, table 43:
1430+ # > A PDF reader shall use the value of the UF key, when present, instead of the F key.
1431+ if key in file_dictionary :
1432+ name = file_dictionary [key ].get_object ()
1433+ break
14151434 else :
1416- attachments [name ] = f_data
1435+ continue
1436+ if name == names [i - 1 ]:
1437+ # Avoid extracting the same file twice.
1438+ continue
1439+
1440+ if filename is not None and name != filename :
1441+ continue
1442+ file_data = file_dictionary ["/EF" ]["/F" ].get_data ()
1443+ if name in attachments :
1444+ if not isinstance (attachments [name ], list ):
1445+ attachments [name ] = [attachments [name ]] # type:ignore
1446+ attachments [name ].append (file_data ) # type:ignore
1447+ else :
1448+ attachments [name ] = file_data
14171449 return attachments
14181450
14191451 @abstractmethod
0 commit comments