credsweeper.file_handler package

Submodules

credsweeper.file_handler.abstract_provider module

class credsweeper.file_handler.abstract_provider.AbstractProvider(paths: Sequence[str | Path | BytesIO | Tuple[str | Path, BytesIO]])[source]

Bases: ABC

Base class for all files provider objects.

abstract get_scannable_files(config: Config) Sequence[ContentProvider][source]

Get a list of file objects for analysis based on the “paths” attribute.

Parameters:

config – dict of credsweeper configuration

Returns:

file objects to analyse

property paths: Sequence[str | Path | BytesIO | Tuple[str | Path, BytesIO]]

paths getter

credsweeper.file_handler.analysis_target module

class credsweeper.file_handler.analysis_target.AnalysisTarget(line_pos: int, lines: List[str], line_nums: List[int], descriptor: Descriptor, line: str | None = None, offset: int | None = None)[source]

Bases: object

property descriptor: Descriptor

cached value

property file_path: str | None

cached value

property file_type: str | None

cached value

property info: str | None

cached value

property line: str

cached value

property line_len: int

cached value

property line_lower: str

cached value

property line_lower_strip: str

cached value

property line_num: int

cached value

property line_nums: List[int]

cached value

property line_pos: int

cached value

property line_strip: str

cached value

property line_strip_len: int

cached value

property lines: List[str]

cached value

property lines_len: int

cached value

property offset: int | None

cached value

credsweeper.file_handler.byte_content_provider module

class credsweeper.file_handler.byte_content_provider.ByteContentProvider(content: bytes, file_path: str | None = None, file_type: str | None = None, info: str | None = None)[source]

Bases: ContentProvider

Allows scanning a byte sequence without an extra file read

property data: bytes | None

data RO getter for ByteContentProvider

free() None[source]

free data after scan to reduce memory usage

property lines: List[str]

lines RO getter for ByteContentProvider

yield_analysis_target(min_len: int) Generator[AnalysisTarget, None, None][source]

Return lines to scan.

Parameters:

min_len – minimal line length to scan

Returns:

list of analysis targets based on every row in the content

credsweeper.file_handler.content_provider module

class credsweeper.file_handler.content_provider.ContentProvider(file_path: str | None = None, file_type: str | None = None, info: str | None = None)[source]

Bases: ABC

Base class to provide access to analysis targets for scanned object.

property data: bytes | None

abstract data getter

property descriptor: Descriptor

descriptor getter

property file_path: str

file_path getter

property file_type: str

file_type getter

abstract free() None[source]

free data after scan to reduce memory usage

property info: str

info getter

lines_to_targets(min_len: int, lines: List[str], line_nums: List[int] | None = None) Generator[AnalysisTarget, None, None][source]

Creates list of targets with multiline concatenation

abstract yield_analysis_target(min_len: int) Generator[AnalysisTarget, None, None][source]

Load and preprocess file diff data to scan.

Parameters:

min_len – minimal line length to scan

Returns:

row objects to analyse

credsweeper.file_handler.data_content_provider module

class credsweeper.file_handler.data_content_provider.DataContentProvider(data: bytes, file_path: str | None = None, file_type: str | None = None, info: str | None = None)[source]

Bases: ContentProvider

Dummy raw provider to keep bytes

property data: bytes | None

data RO getter for DataContentProvider; the property is used in deep scan

free() None[source]

free data after scan to reduce memory usage

represent_as_html(depth: int, recursive_limit_size: int, keywords_required_substrings_check: Callable[[str], bool]) bool | None[source]

Tries to read data as html

Returns:

True if reading was successful, False if no data found, None if the format is not acceptable

represent_as_structure() bool | None[source]

Tries to convert data with many parsers. Stores result to internal structure

Returns:

True if some structure found, False if no data found, None if the format is not acceptable

represent_as_xml() bool | None[source]

Tries to read data as xml

Returns:

True if reading was successful, False if no data found, None if the format is not acceptable

static simple_html_representation(html: BeautifulSoup) Tuple[List[int], List[str], int][source]

simple parse as it is displayed to user and appends the lines

property text: str

Getter to produce a text from DEFAULT_ENCODING. Empty str for unrecognized data

yield_analysis_target(min_len: int) Generator[AnalysisTarget, None, None][source]

Return nothing. The class provides only data storage.

Parameters:

min_len – minimal line length to scan

Raises:

NotImplementedError

credsweeper.file_handler.descriptor module

class credsweeper.file_handler.descriptor.Descriptor(path: str, extension: str, info: str)[source]

Bases: object

Descriptor for file - optimize memory consumption

extension: str
info: str
path: str

credsweeper.file_handler.diff_content_provider module

class credsweeper.file_handler.diff_content_provider.DiffContentProvider(file_path: str, change_type: DiffRowType, diff: List[DiffDict])[source]

Bases: ContentProvider

Provide data from a single .patch file.

Parameters:
  • file_path – path to file

  • change_type – set added or deleted file data to scan

  • diff

    list of file row changes, with base elements represented as:

    {
        "old": line number before diff,
        "new": line number after diff,
        "line": line text,
        "hunk": diff hunk number
    }
    

property data: bytes

data getter for DiffContentProvider

property diff: List[DiffDict]

diff getter for DiffContentProvider

free() None[source]

free data after scan to reduce memory usage

static parse_lines_data(change_type: DiffRowType, lines_data: List[DiffRowData]) Tuple[List[int], List[str]][source]

Parse diff lines data.

Return list of line numbers with change type “self.change_type” and list of all lines in file

in original order (all lines not mentioned in the diff file are replaced with a blank line)

Parameters:
  • change_type – set added or deleted file data to scan

  • lines_data – data of all rows mentioned in diff file

Returns:

tuple of line numbers with change type “self.change_type” and all file lines in original order (all lines not mentioned in the diff file are replaced with a blank line)

static patch2files_diff(raw_patch: List[str], change_type: DiffRowType) Dict[str, List[DiffDict]][source]

Generate files changes from patch for added or deleted filepaths.

Parameters:
  • raw_patch – git patch file content

  • change_type – change type to select, DiffRowType.ADDED or DiffRowType.DELETED

Returns:

dict with {file path: list of file row changes}, where the elements of each list of file row changes are represented as:

{
    "old": line number before diff,
    "new": line number after diff,
    "line": line text,
    "hunk": diff hunk number
}

static preprocess_diff_rows(added_line_number: int | None, deleted_line_number: int | None, line: str) List[DiffRowData][source]

Auxiliary function to extend diff changes.

Parameters:
  • added_line_number – number of added line or None

  • deleted_line_number – number of deleted line or None

  • line – the text line

Returns:

diff rows data as a list of row change type, line number, row content

static preprocess_file_diff(changes: List[DiffDict]) List[DiffRowData][source]

Generate changed file rows from diff data with changed lines (e.g. marked + or - in diff).

Parameters:

changes – git diff by file rows data

Returns:

diff rows data as a list of row change type, line number, row content

static wrong_change(change: DiffDict) bool[source]

Returns True if the change is wrong

yield_analysis_target(min_len: int) Generator[AnalysisTarget, None, None][source]

Preprocess file diff data to scan.

Parameters:

min_len – minimal line length to scan

Returns:

list of analysis targets of every row of file diff corresponding to change type “self.change_type”

class credsweeper.file_handler.diff_content_provider.DiffDict

Bases: dict

hunk: Any
line: str | bytes
new: int | None
old: int | None
class credsweeper.file_handler.diff_content_provider.DiffRowData(line_type: DiffRowType, line_numb: int, line: str)[source]

Bases: object

Class for keeping data of diff row.

line: str
line_numb: int
line_type: DiffRowType

credsweeper.file_handler.file_path_extractor module

class credsweeper.file_handler.file_path_extractor.FilePathExtractor[source]

Bases: object

Util class to browse files in directories

FIND_BY_EXT_RULE = 'Suspicious File Extension'
static apply_gitignore(detected_files: List[str]) List[str][source]

Apply gitignore rules for each file.

Parameters:

detected_files – list of files to be checked

Returns:

List of files with all files ignored by git removed

static check_exclude_file(config: Config, path: str) bool[source]

Checks whether file should be excluded

Parameters:
  • config – Config

  • path – str - full path preferred

Returns:

True when the file's full path should be excluded according to the config

static check_file_size(config: Config, reference: str | Path | BytesIO | Tuple[str | Path, BytesIO]) bool[source]

Checks whether the file is over the size limit from configuration or less than MIN_DATA_LEN

Parameters:
  • config – Config

  • reference – various types of a file reference

Returns:

True when the file is oversize or less than MIN_DATA_LEN, or unsupported

static get_file_paths(config: Config, path: str | Path) List[str][source]

Get all files in the directory. Automatically exclude non-code or data files (such as .jpg).

Parameters:
  • config – credsweeper configuration

  • path – path to the file or directory to be scanned

Returns:

List all non-excluded files in the directory

static is_find_by_ext_file(config: Config, extension: str) bool[source]

Checks whether file has suspicious extension

Parameters:
  • config – Config

  • extension – str - may be only file name with extension

Returns:

True when the feature is configured and the file extension matches

classmethod is_valid_path(path: str) bool[source]

Locate nearest .git directory to the path and check if path is ignored.

Parameters:

path – path to the file or directory to check

Returns:

False if file is ignored by git. True otherwise

located_repos: Dict[Path, Repo] = {}

credsweeper.file_handler.files_provider module

class credsweeper.file_handler.files_provider.FilesProvider(paths: Sequence[str | Path | BytesIO | Tuple[str | Path, BytesIO]], skip_ignored: bool | None = None)[source]

Bases: AbstractProvider

Provider of plain os files to be analysed.

get_scannable_files(config: Config) Sequence[ContentProvider][source]

Get a list of full-text file objects for analysis of files with parent paths from “paths”.

Parameters:

config – dict of credsweeper configuration

Returns:

preprocessed file objects for analysis

credsweeper.file_handler.patches_provider module

class credsweeper.file_handler.patches_provider.PatchesProvider(paths: Sequence[str | Path | BytesIO | Tuple[str | Path, BytesIO]], change_type: DiffRowType)[source]

Bases: AbstractProvider

Provide data from a list of .patch files.

get_files_sequence(raw_patches: List[List[str]]) Sequence[ContentProvider][source]

Returns sequence of files

get_scannable_files(config: Config) Sequence[ContentProvider][source]

Get files to scan. Output based on the paths field.

Parameters:

config – dict of credsweeper configuration

Returns:

file objects for analysing

load_patch_data(config: Config) List[List[str]][source]

Loads data from patch

credsweeper.file_handler.string_content_provider module

class credsweeper.file_handler.string_content_provider.StringContentProvider(lines: List[str], line_numbers: List[int] | None = None, file_path: str | None = None, file_type: str | None = None, info: str | None = None)[source]

Bases: ContentProvider

Provider that scans simple text lines

property data: bytes

data getter for StringContentProvider

free() None[source]

free data after scan to reduce memory usage

property line_numbers: List[int]

line_numbers RO getter for StringContentProvider

property lines: List[str]

lines RO getter for StringContentProvider

yield_analysis_target(min_len: int) Generator[AnalysisTarget, None, None][source]

Return lines to scan.

Parameters:

min_len – minimal line length to scan

Returns:

list of analysis targets based on every row in file

credsweeper.file_handler.struct_content_provider module

class credsweeper.file_handler.struct_content_provider.StructContentProvider(struct: Any, file_path: str | None = None, file_type: str | None = None, info: str | None = None)[source]

Bases: ContentProvider

Content provider to keep structured data

property data: bytes

data getter for StructContentProvider

free() None[source]

free data after scan to reduce memory usage

property struct: Any

struct getter for StructContentProvider

yield_analysis_target(min_len: int) Generator[AnalysisTarget, None, None][source]

Return nothing. The class provides only data storage.

Parameters:

min_len – minimal line length to scan

Raises:

NotImplementedError

credsweeper.file_handler.text_content_provider module

class credsweeper.file_handler.text_content_provider.TextContentProvider(file_path: str | Path | Tuple[str | Path, BytesIO], file_type: str | None = None, info: str | None = None)[source]

Bases: ContentProvider

Provide access to analysis targets for full-text file scanning.

Parameters:

file_path – string, path to file

property data: bytes | None

data RO getter for TextContentProvider

free() None[source]

free data after scan to reduce memory usage

property lines: List[str] | None

lines getter for TextContentProvider

yield_analysis_target(min_len: int) Generator[AnalysisTarget, None, None][source]

Load and preprocess file content to scan.

Parameters:

min_len – minimal line length to scan

Returns:

list of analysis targets based on every row in file

Module contents