Source code for credsweeper.deep_scanner.docx_scanner

import io
import logging
from abc import ABC
from typing import List, Optional

import docx
from docx.document import Document
from docx.oxml import CT_P, CT_Tbl, CT_SectPr, CT_TcPr
from docx.section import Section, _Header, _Footer
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from lxml.etree import _Element

from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider

logger = logging.getLogger(__name__)



[docs]
class DocxScanner(AbstractScanner, ABC):
    """Implements docx scanning"""


[docs]
    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """Assume, ZIP prefix and common office files were checked before"""
        if b"word/document.xml" in data:
            return True
        return False


    @staticmethod
    def _iter_block_items(block):
        if isinstance(block, Paragraph):
            yield block
            return
        if isinstance(block, (_Header, _Footer)):
            for table in block.tables:
                for row in table.rows:
                    for cell in row.cells:
                        yield from DocxScanner._iter_block_items(cell)
            yield from block.paragraphs
            return
        if isinstance(block, Section):
            yield from DocxScanner._iter_block_items(block.header)
            yield from DocxScanner._iter_block_items(block.footer)
            return

        if isinstance(block, Document):
            parent_elm = block.element.body
        elif isinstance(block, _Cell):
            parent_elm = block._tc  # pylint: disable=W0212
        else:
            raise ValueError(f"unrecognised:{type(block)}")

        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, block)
            elif isinstance(child, CT_Tbl):
                table = Table(child, block)
                for row in table.rows:
                    for cell in row.cells:
                        yield from DocxScanner._iter_block_items(cell)
            elif isinstance(child, (CT_TcPr, CT_SectPr)):
                # config
                pass
            elif isinstance(child, _Element):
                yield child
            else:
                logger.warning("Unknown:%s", type(child))


[docs]
    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Tries to scan DOCX text with splitting by lines"""
        try:
            docx_lines: List[str] = []

            doc = docx.Document(io.BytesIO(data_provider.data))
            for block in self._iter_block_items(doc):
                if block.text:
                    docx_lines.append(block.text)

            header_lines_set = set()
            footer_lines_set = set()
            for section in doc.sections:
                for header in [section.first_page_header, section.even_page_header, section.header]:
                    for block in self._iter_block_items(header):
                        if block.text:
                            header_lines_set.add(block.text)
                for footer in [section.first_page_footer, section.even_page_footer, section.footer]:
                    for block in self._iter_block_items(footer):
                        if block.text:
                            footer_lines_set.add(block.text)
            docx_lines.extend(sorted(list(header_lines_set)))
            docx_lines.extend(sorted(list(footer_lines_set)))

            string_data_provider = StringContentProvider(lines=docx_lines,
                                                         file_path=data_provider.file_path,
                                                         file_type=data_provider.file_type,
                                                         info=f"{data_provider.info}|DOCX")
            docx_candidates = self.scanner.scan(string_data_provider)
            return docx_candidates

        except Exception as docx_exc:
            logger.warning("%s:%s", data_provider.file_path, docx_exc)
        return None