Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import warnings
import weakref
import zipfile
from operator import itemgetter

from . import extra

Expand Down Expand Up @@ -2923,6 +2924,8 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
self.is_encrypted = False
self.is_encrypted = False
self.metadata = None
self.has_duplicate_images = False
self.images_xrefs_by_page = None
self.FontInfos = []
self.Graftmaps = {}
self.ShownPages = {}
Expand Down Expand Up @@ -3047,6 +3050,30 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
self.page_count2 = extra.page_count_pdf
else:
self.page_count2 = extra.page_count_fz

# If the document is a PDF, check for image duplication across pages.
# This may happen, e.g., when converting from MS Office formats with
# external tools. A PDF with only one page cannot have duplication
# across pages, so it is skipped.
if self.is_pdf and self.page_count > 1:
    first_page_n_images = len(self.get_page_images(0))
    # Heuristic: duplication is suspected only when every page reports the
    # same number of images as the first page; a single page with a
    # different count rules out full-document duplication. A count of
    # zero means there are no images at all, hence nothing to deduplicate
    # and no reason to scan every page below.
    self.has_duplicate_images = first_page_n_images > 0 and all(
        len(page.get_images()) == first_page_n_images
        for page in self.pages(start=1)
    )

if self.has_duplicate_images:
    # Build one entry per page, in page order, so the result can be indexed
    # by page number: entry ``pno`` holds the xrefs of the images reported
    # by ``get_image_info`` for that page.
    # NOTE: each page's list must be *appended*; assigning it directly
    # would leave only the last page's flat xref list, and a later
    # ``xref in self.images_xrefs_by_page[pno]`` lookup would fail.
    self.images_xrefs_by_page = [
        list(map(itemgetter("xref"), page.get_image_info(xrefs=True)))
        for page in self.pages()
    ]
finally:
JM_mupdf_show_errors = JM_mupdf_show_errors_old

Expand Down Expand Up @@ -5090,7 +5117,14 @@ def get_page_images(self, pno: int, full: bool =False) -> list:
return ()
val = self._getPageInfo(pno, 2)
if not full:
return [v[:-1] for v in val]
val = [v[:-1] for v in val]
if self.has_duplicate_images:
deduplicated_val = []
for v in val:
# v[0] is "xref"
if v[0] in self.images_xrefs_by_page[pno]:
deduplicated_val.append(v)
return deduplicated_val
return val

def get_page_labels(self):
Expand Down
Loading