Source code for credsweeper.file_handler.file_path_extractor
import io
import logging
import os
from pathlib import Path
from typing import List, Dict, Union, Tuple
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
from credsweeper.config import Config
from credsweeper.utils import Util
logger = logging.getLogger(__name__)
[docs]
class FilePathExtractor:
    """Util class to browse files in directories"""

    # Cache of git repositories already opened by is_valid_path, keyed by the directory they were opened from.
    # It is shared by the whole class on purpose: a repository is loaded once, not once per checked file.
    located_repos: Dict[Path, Repo] = {}

    @staticmethod
    def apply_gitignore(detected_files: List[str]) -> List[str]:
        """Apply gitignore rules for each file.

        Args:
            detected_files: list of files to be checked

        Return:
            List of files with all files ignored by git removed

        """
        return [file_path for file_path in detected_files if FilePathExtractor.is_valid_path(file_path)]

    @staticmethod
    def get_file_paths(config: Config, path: Union[str, Path]) -> List[str]:
        """Get all files to be scanned for the given file or directory.

        A single file is returned unless it is excluded by the configuration.
        A directory is walked recursively; files excluded by the configuration, symbolic links,
        files over the configured size limit and empty files are skipped.

        Args:
            config: credsweeper configuration
            path: path to the file or directory to be scanned

        Return:
            List all non-excluded files in the directory

        """
        path = os.path.expanduser(path)  # Replace ~ character with a full path to the home directory
        if not os.path.exists(path):
            # Only a warning: neither branch below matches a missing path, so an empty list is returned
            logger.warning(f"'{path}' does not exist")

        file_paths = []
        if os.path.isfile(path):
            # suppose, the file is located outside and should be scanned
            if not FilePathExtractor.check_exclude_file(config, path):
                file_paths.append(path)
        elif os.path.isdir(path):
            for dirpath, _, filenames in os.walk(path):
                for filename in filenames:
                    file_path = os.path.join(f"{dirpath}", f"{filename}")
                    # islink is tested before check_file_size, so the size of a link target is never queried
                    if FilePathExtractor.check_exclude_file(config, file_path) \
                            or os.path.islink(file_path) \
                            or FilePathExtractor.check_file_size(config, file_path):
                        continue
                    if os.path.isfile(file_path) and 0 < os.path.getsize(file_path):
                        file_paths.append(file_path)
        # anything else (symbolic links and so on) contributes nothing
        return file_paths

    @classmethod
    def is_valid_path(cls, path: str) -> bool:
        """Locate nearest .git directory to the path and check if path is ignored.

        Args:
            path: path to the file or directory to check

        Return:
            False if file is ignored by git. True otherwise

        """
        parent_directory = Path(path).parent

        # Iterate over file path to find nearest ".git" directory
        while True:
            try:
                if parent_directory in cls.located_repos:
                    repo = cls.located_repos[parent_directory]
                else:
                    # The directory must have ".git" in it. If not it occurs error.
                    repo = Repo(parent_directory)
                    # Cache already located repositories, so we would not need to load it for each new file
                    cls.located_repos[parent_directory] = repo
                # Return True if there is no ignored file in 'path' and False if any.
                return len(repo.ignored(path)) == 0
            except (InvalidGitRepositoryError, NoSuchPathError):
                new_parent = parent_directory.parent
                # If we encountered root and cannot move further: no .git directory located in the entire path
                if new_parent == parent_directory:
                    return True
                parent_directory = new_parent

    @staticmethod
    def is_find_by_ext_file(config: Config, extension: str) -> bool:
        """Checks whether file has suspicious extension

        Args:
            config: Config
            extension: str - may be only file name with extension

        Return:
            True when the feature is configured and the file extension matches

        """
        # bool() keeps the declared return type even when config.find_by_ext is a falsy non-bool value
        return bool(config.find_by_ext and extension in config.find_by_ext_list)

    @staticmethod
    def check_exclude_file(config: Config, path: str) -> bool:
        """Checks whether file should be excluded

        Args:
            config: Config
            path: str - full path preferred

        Return:
            True when the file full path should be excluded according config

        """
        path = path.replace('\\', '/')
        # Patterns and extensions are compared against the lowered path; exclude_paths use the original case
        lower_path = path.lower()
        if config.not_allowed_path_pattern.match(lower_path):
            return True
        if any(exclude_pattern.match(lower_path) for exclude_pattern in config.exclude_patterns):
            return True
        # must be case-sensitive
        if any(exclude_path in path for exclude_path in config.exclude_paths):
            return True
        file_extension = Util.get_extension(lower_path, lower=False)
        if file_extension in config.exclude_extensions:
            return True
        if not config.depth and file_extension in config.exclude_containers:
            return True
        # --depth or --doc enables scan for all documents extensions
        if not (config.depth or config.doc) and file_extension in config.exclude_documents:
            return True
        return False

    @staticmethod
    def check_file_size(config: Config, reference: Union[str, Path, io.BytesIO, Tuple[Union[str, Path],
                                                                                      io.BytesIO]]) -> bool:
        """Checks whether the file is over the size limit from configuration

        Args:
            config: Config
            reference: various types of a file reference; for a tuple the second item is measured

        Return:
            True when the file is oversize

        """
        if config.size_limit is None:
            return False

        file_size = None
        path = reference[1] if isinstance(reference, tuple) else reference
        if isinstance(path, (str, Path)):
            file_size = os.path.getsize(path)
        elif isinstance(path, io.BytesIO):
            current_pos = path.tell()
            # The end offset is the whole size of the stream. It must not depend on how much of the
            # stream was already consumed, otherwise a partly read oversize stream passes the limit.
            file_size = path.seek(0, io.SEEK_END)
            # Restore the position, so the check is transparent for the caller
            path.seek(current_pos, io.SEEK_SET)
        else:
            logger.error(f"Unknown path type: {path}")

        if file_size and file_size > config.size_limit:
            logger.warning(f"Size ({file_size}) of the file '{path}' is over limit ({config.size_limit})")
            return True
        return False