Source code for credsweeper.ml_model.features.entropy_evaluation

import math
from typing import Dict, List, Set

import numpy as np

from credsweeper.common.constants import Chars, ML_HUNK
from credsweeper.credentials.candidate import Candidate
from credsweeper.file_handler.data_content_provider import MIN_DATA_LEN
from credsweeper.ml_model.features.feature import Feature



[docs]
class EntropyEvaluation(Feature):
    """
    Entropy feature: Renyi (alpha = 0.5 and alpha = 2) and Shannon entropy of a value,
    each one divided by the Hartley entropy, i.e. log2 of the analysed length.
    The entropies are followed by one flag per ``Chars`` item (hex, base64, etc.)
    that is set when every character of the value belongs to that character set.
    Only the head of the value, up to ``HUNK_SIZE`` characters, is inspected.

    Background on the entropy measures:
    https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf

    """

    # The ML model analyses at most ML_HUNK characters, but a value may be longer,
    # so a window four times that size is used here
    HUNK_SIZE = 4 * ML_HUNK
    # Precomputed log2 for every length from 4 up to HUNK_SIZE inclusive
    LOG2_CACHE: Dict[int, float] = {length: math.log2(length) for length in range(4, HUNK_SIZE + 1)}
    # One set of allowed characters per Chars item, in the iteration order of Chars
    CHAR_SET: List[Set[str]] = [set(charset.value) for charset in Chars]
    # Three entropy slots followed by one flag per character set
    RESULT_SIZE = len(Chars) + 3

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns three normalized entropies followed by the flags of matching character sets"""
        features: np.ndarray = np.zeros(shape=EntropyEvaluation.RESULT_SIZE, dtype=np.float32)
        # the value of the first line data is cut down to the analysed window
        head = candidate.line_data_list[0].value[:EntropyEvaluation.HUNK_SIZE]
        length = len(head)
        symbols, occurrences = np.unique(list(head), return_counts=True)

        if length >= MIN_DATA_LEN:
            # entropies are evaluated only for values that are long enough
            frequencies = occurrences / length
            # Hartley entropy of the analysed length; -1.0 is used when the length is not cached
            normalizer = EntropyEvaluation.LOG2_CACHE.get(length, -1.0)
            entropies = (
                # Renyi entropy, alpha = 0.5
                2 * np.log2(np.sum(frequencies**0.5)),
                # Shannon entropy, i.e. Renyi entropy for alpha = 1
                -np.sum(frequencies * np.log2(frequencies)),
                # Renyi entropy, alpha = 2
                -1 * np.log2(np.sum(frequencies**2)),
            )
            for position, entropy in enumerate(entropies):
                features[position] = entropy / normalizer

        if length > 0:
            # character set flags are evaluated for any non-empty value
            # a separate variable keeps mypy satisfied
            present = set(symbols)
            for position, alphabet in enumerate(EntropyEvaluation.CHAR_SET, start=3):
                if present.issubset(alphabet):
                    features[position] = 1.0

        return features