Source code for credsweeper.ml_model.features.entropy_evaluation

import math
from typing import Dict, List, Set

import numpy as np

from credsweeper.common.constants import Chars, ML_HUNK
from credsweeper.credentials.candidate import Candidate
from credsweeper.file_handler.data_content_provider import MIN_DATA_LEN
from credsweeper.ml_model.features.feature import Feature


[docs] class EntropyEvaluation(Feature): """ Renyi, Shannon entropy evaluation with Hartley entropy normalization. Augmentation with possible set of chars (hex, base64, etc.) Analyse only begin of the value See next link for details: https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf """ # Max size of ML analyzed value is ML_HUNK but value may be bigger HUNK_SIZE = 4 * ML_HUNK LOG2_CACHE: Dict[int, float] = {x: math.log2(x) for x in range(4, 4 * ML_HUNK + 1)} CHAR_SET: List[Set[str]] = [set(x.value) for x in Chars] RESULT_SIZE = 3 + len(Chars)
[docs] def extract(self, candidate: Candidate) -> np.ndarray: """Returns real entropy and possible sets of characters""" # only head of value will be analyzed result: np.ndarray = np.zeros(shape=EntropyEvaluation.RESULT_SIZE, dtype=np.float32) value = candidate.line_data_list[0].value[:EntropyEvaluation.HUNK_SIZE] size = len(value) uniq, counts = np.unique(list(value), return_counts=True) if MIN_DATA_LEN <= size: # evaluate the entropy for a value of at least 4 probabilities = counts / size hartley_entropy = EntropyEvaluation.LOG2_CACHE.get(size, -1.0) # renyi_entropy alpha=0.5 sum_prob_05 = np.sum(probabilities**0.5) renyi_entropy_05 = 2 * np.log2(sum_prob_05) result[0] = renyi_entropy_05 / hartley_entropy # shannon_entropy or renyi_entropy alpha=1 shannon_entropy = -np.sum(probabilities * np.log2(probabilities)) result[1] = shannon_entropy / hartley_entropy # renyi_entropy alpha=2 sum_prob_2 = np.sum(probabilities**2) renyi_entropy_2 = -1 * np.log2(sum_prob_2) result[2] = renyi_entropy_2 / hartley_entropy if 0 < size: # check charset for non-zero value # use the new variable to deal with mypy uniq_set = set(uniq) for n, i in enumerate(EntropyEvaluation.CHAR_SET, start=3): if not uniq_set.difference(i): result[n] = 1.0 return result