import contextlib
import logging
import re
from abc import ABC
from typing import List, Optional
from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
# 8 bytes are encoded to 12 symbols 12345678 -> MTIzNDU2Nzg=
MIN_ENCODED_DATA_LEN = 12
[docs]
class EncoderScanner(AbstractScanner, ABC):
"""Implements recursive iteration when data might be encoded from base64"""
BASE64_PATTERN = re.compile(
rb"(\xFF\xFE|\xFE\xFF)?("
rb"(?:(?P<a>[A-Z])|(?P<b>[a-z])|(?P<c>[0-9/+])|[\s\x00\\])+(?(a)(?(b)(?(c)(=+|$)|(?!x)x)|(?!x)x)|(?!x)x)|"
rb"(?:(?P<e>[A-Z])|(?P<f>[a-z])|(?P<g>[0-9_-])|[\s\x00\\])+(?(e)(?(f)(?(g)(=+|$)|(?!x)x)|(?!x)x)|(?!x)x))")
[docs]
@staticmethod
def match(data: bytes | bytearray) -> bool:
"""Check if data MAY be base64 encoded with whitespaces (escaping too)"""
if len(data) >= MIN_ENCODED_DATA_LEN \
and EncoderScanner.BASE64_PATTERN.match(data, pos=0, endpos=MAX_LINE_LENGTH):
return True
return False
[docs]
@staticmethod
def decode(text: str) -> Optional[bytes]:
"""Decodes base64 text with cleaning whitespaces. Returns None when the decoding fails"""
with contextlib.suppress(Exception):
return Util.decode_base64(text=Util.PEM_CLEANING_PATTERN.sub(r'', text).replace('\\', ''),
padding_safe=True,
urlsafe_detect=True)
return None
[docs]
def data_scan(
self, #
data_provider: DataContentProvider, #
depth: int, #
recursive_limit_size: int) -> Optional[List[Candidate]]:
"""Tries to decode data from base64 encode to bytes and scan as bytes again"""
if decoded := EncoderScanner.decode(data_provider.text):
decoded_data_provider = DataContentProvider(data=decoded,
file_path=data_provider.file_path,
file_type=data_provider.file_type,
info=f"{data_provider.info}|BASE64")
new_limit = recursive_limit_size - len(decoded_data_provider.data)
return self.recursive_scan(decoded_data_provider, depth, new_limit)
return None