Source code for credsweeper.file_handler.file_path_extractor
import io
import logging
import os
from pathlib import Path
from typing import List, Dict, Union, Tuple
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
from credsweeper.common.constants import MIN_DATA_LEN
from credsweeper.config.config import Config
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
[docs]
class FilePathExtractor:
    """Util class to browse files in directories"""

    FIND_BY_EXT_RULE = "Suspicious File Extension"
    # Process-wide cache: directory -> repository that governs it.
    # Shared by all callers so a repository is opened only once per directory.
    located_repos: Dict[Path, Repo] = {}

    @staticmethod
    def apply_gitignore(detected_files: List[str]) -> List[str]:
        """Apply gitignore rules for each file.

        Args:
            detected_files: list of files to be checked

        Return:
            List of files with all files ignored by git removed

        """
        return [file_path for file_path in detected_files if FilePathExtractor.is_valid_path(file_path)]

    @staticmethod
    def get_file_paths(config: Config, path: Union[str, Path]) -> List[str]:
        """Get all files in the directory. Automatically exclude non-code or data files (such as .jpg).

        A single file given directly is only checked against the exclusion rules (its size is not checked).
        Files found while walking a directory are additionally skipped when they are symbolic links
        or when `check_file_size` rejects them.

        Args:
            config: credsweeper configuration
            path: path to the file or directory to be scanned

        Return:
            List of all non-excluded files; empty when the path does not exist

        """
        path = os.path.expanduser(path)  # Replace ~ character with a full path to the home directory
        file_paths: List[str] = []
        if not os.path.exists(path):
            logger.warning("Path '%s' does not exist", path)
            return file_paths
        if os.path.isfile(path):
            # suppose, the file is located outside and should be scanned
            if not FilePathExtractor.check_exclude_file(config, path):
                file_paths.append(path)
        elif os.path.isdir(path):
            for dirpath, _, filenames in os.walk(path):
                for filename in filenames:
                    file_path = os.path.join(dirpath, filename)
                    if FilePathExtractor.check_exclude_file(config, file_path) or os.path.islink(file_path):
                        continue
                    if os.path.isfile(file_path) and not FilePathExtractor.check_file_size(config, file_path):
                        file_paths.append(file_path)
        # anything else (neither a regular file nor a directory) yields no files
        return file_paths

    @classmethod
    def is_valid_path(cls, path: str) -> bool:
        """Locate nearest .git directory to the path and check if path is ignored.

        Args:
            path: path to the file or directory to check

        Return:
            False if file is ignored by git. True otherwise

        """
        directory = Path(path).parent
        # Directories probed for this path that turned out not to be repositories
        probed: List[Path] = []
        # Iterate over file path to find nearest ".git" directory
        while True:
            try:
                repo = cls.located_repos.get(directory)
                if repo is None:
                    # The directory must have ".git" in it. If not it occurs error.
                    repo = Repo(directory)
                # Cache the repository for the directory where it was found and for every directory
                # walked through on the way up, so sibling files do not repeat the failed lookups
                cls.located_repos[directory] = repo
                for visited in probed:
                    cls.located_repos[visited] = repo
                # True if git reports nothing ignored for 'path', False otherwise
                return not repo.ignored(path)
            except (InvalidGitRepositoryError, NoSuchPathError):
                probed.append(directory)
                parent = directory.parent
                # If we encountered root and cannot move further: no .git directory located in the entire path
                if parent == directory:
                    return True
                directory = parent

    @staticmethod
    def is_find_by_ext_file(config: Config, extension: str) -> bool:
        """Checks whether file has suspicious extension

        Args:
            config: Config
            extension: str - may be only file name with extension

        Return:
            True when the feature is configured and the file extension matches

        """
        # bool() guarantees a real boolean even when config.find_by_ext is a falsy non-bool value (e.g. None)
        return bool(config.find_by_ext) and extension in config.find_by_ext_list

    @staticmethod
    def check_exclude_file(config: Config, path: str) -> bool:
        """Checks whether file should be excluded

        Args:
            config: Config
            path: str - full path preferred

        Return:
            True when the file full path should be excluded according config

        """
        # pedantic mode disables every exclusion rule
        if config.pedantic:
            return False
        # normalize path separators so the rules are written once with '/'
        path = path.replace('\\', '/')
        lower_path = path.lower()
        if config.not_allowed_path_pattern.match(lower_path):
            return True
        if any(exclude_pattern.match(lower_path) for exclude_pattern in config.exclude_patterns):
            return True
        # must be case-sensitive, so the original-case path is used here
        if any(exclude_path in path for exclude_path in config.exclude_paths):
            return True
        # the path passed in is already lower-cased, hence lower=False
        file_extension = Util.get_extension(lower_path, lower=False)
        if file_extension in config.exclude_extensions:
            return True
        # containers are excluded only when --depth is not set
        if not config.depth and file_extension in config.exclude_containers:
            return True
        # --depth or --doc enables scan for all documents extensions
        if not (config.depth or config.doc) and file_extension in config.exclude_documents:
            return True
        return False

    @staticmethod
    def check_file_size(config: Config, reference: Union[str, Path, io.BytesIO, Tuple[Union[str, Path],
                                                                                      io.BytesIO]]) -> bool:
        """Checks whether the file is over the size limit from configuration or less MIN_DATA_LEN

        Args:
            config: Config
            reference: various types of a file reference; for a tuple the second item is measured

        Return:
            True when the file is oversize or less than MIN_DATA_LEN, or unsupported

        """
        path = reference[1] if isinstance(reference, tuple) else reference
        if isinstance(path, (str, Path)):
            file_size = os.path.getsize(path)
        elif isinstance(path, io.BytesIO):
            # measure the bytes remaining from the current position, then restore the position
            current_pos = path.tell()
            path.seek(0, io.SEEK_END)
            file_size = path.tell() - current_pos
            path.seek(current_pos, io.SEEK_SET)
        else:
            logger.error("Unknown path type: %s", path)
            return True
        if MIN_DATA_LEN > file_size:
            logger.debug("Size (%s) of the file '%s' is too small", file_size, path)
            return True
        # the upper limit applies only when size_limit is configured as an int
        if isinstance(config.size_limit, int) and config.size_limit < file_size:
            logger.warning("Size (%s) of the file '%s' is over limit (%s)", file_size, path, config.size_limit)
            return True
        return False