Source code for src.checks.checked_in_binaries

"""Check that determines the file type for every file in
the project and compares it to a blacklist of binary executable file
formats"""

import logging
import re
from collections import defaultdict
from collections.abc import Hashable
from pathlib import Path
from typing import Any, Optional

from fact_helper_file import get_file_type_from_path  # type: ignore[import]
from git.repo import Repo

from src.config import context
from src.interfaces import CheckInterface
from src.opencode_git import clone_project

from .interfaces_checked_in_binaries import (
    FileTypeInterface,
    FileTypeToolInterface,
)

logger = logging.getLogger(__name__)


# Tool: fkie-cad / fact_helper_file


class FactHelperFileFileType(FileTypeInterface):
    """File type description in the form delivered by ``fact_helper_file``.

    Two strings are kept per instance: the MIME type (``mime``) and the
    full description (``full``).  Only the MIME type is part of the key
    returned by :meth:`_key`.
    """

    def __init__(self, ft: dict[str, str]) -> None:
        """Take over the ``"mime"`` and ``"full"`` entries of ``ft``.

        :param ft: mapping that contains at least the keys ``"mime"`` and
            ``"full"``
        :raises KeyError: if one of the two keys is missing
        """
        self.mime: str
        self.full: str
        self.mime, self.full = ft["mime"], ft["full"]

    def _key(self) -> tuple[Hashable, ...]:
        """Return the one-element tuple holding the MIME type.

        The full description is deliberately left out of the key.
        NOTE(review): presumably ``FileTypeInterface`` derives equality and
        hashing from this key -- confirm against the interface.
        """
        key: tuple[Hashable, ...] = (self.mime,)
        return key
class FactHelperFile(FileTypeToolInterface):
    """File type detection tool backed by ``fact_helper_file``."""

    def file_type_of(self, file: Path) -> FactHelperFileFileType:
        """Detect the file type of ``file``.

        :param file: path of the file to classify
        :return: the result of ``get_file_type_from_path`` wrapped in a
            :class:`FactHelperFileFileType`
        """
        detected = get_file_type_from_path(file)
        return FactHelperFileFileType(detected)
# Check: flags files whose detected file type is on the blacklist
class CheckedInBinaries(CheckInterface):
    """Represents a check that determines the file type for every file in
    the project and compares it to a blacklist of binary executable file
    formats.

    The blacklist is built from the files found in ``blacklist_dir``: every
    configured file type tool classifies each of those files and the
    resulting file types (minus the whitelisted, too generic ones) form the
    blacklist.  A project file whose detected type is on the blacklist is a
    violation; a single violation sets the score to ``0.0``.
    """

    # Directory holding the local copy of the blacklist repository.
    blacklist_dir: Path = context.settings["CheckedInBinaries_blacklist_dir"]
    # Names to leave out of the check.  The dot is escaped so that only a
    # literal ".git" matches the first alternative (an unescaped "." would
    # also match e.g. "agit").
    # NOTE(review): the pattern is consumed outside this class body,
    # presumably by the file list generation of CheckInterface -- confirm.
    exclude: re.Pattern = re.compile(r"(^\.git$|test)")

    def __init__(self, *args, **kwargs):
        """Set up tools, whitelist and blacklist and the result containers.

        All arguments are passed through unchanged to the parent
        constructor.
        """
        super().__init__(*args, **kwargs)
        self.fileTypeTools: list[type[FileTypeToolInterface]] = [
            FactHelperFile
        ]
        self.blacklist: set[FileTypeInterface] = set()
        # File types that must never end up on the blacklist because they
        # would produce too many false positives.
        self.whitelist: set[FileTypeInterface] = {
            FactHelperFileFileType(
                {"mime": "application/octet-stream", "full": "data"}
            )
        }
        if not self.blacklist:
            logger.info("Initializing blacklist of executable file formats")
            self.__init_blacklist()
        logger.info(f"Using blacklist {self.blacklist}")
        # One entry per tool: (tool class, file type -> offending paths).
        self.all_tool_findings: list[
            tuple[
                type[FileTypeToolInterface],
                dict[FileTypeInterface, list[Path]],
            ]
        ] = []
        # Offending paths of all tools, grouped by file type.
        self.all_violations: dict[FileTypeInterface, list[Path]] = defaultdict(
            list
        )

    def __update_blacklist(self) -> None:
        """Bring the local blacklist copy up to date.

        A non-empty ``blacklist_dir`` is treated as an existing git
        checkout and updated in place; an empty one is populated by cloning
        the repository named in the settings.
        """
        # any() stops at the first entry; Path objects are always truthy.
        if any(self.blacklist_dir.iterdir()):
            # check for updates of an existing blacklist
            logger.info("Existing blacklist found, updating...")
            repo: Repo = Repo(self.blacklist_dir)
            git = repo.git
            git.fetch()
            git.merge("--strategy-option", "theirs", "--no-edit")
            git.pull("-X", "theirs")
        else:
            # fetch a new copy of the blacklist
            logger.info("You have no blacklist, fetching current version ...")
            clone_project(
                context.settings["CheckedInBinaries_blacklist_repo"],
                self.blacklist_dir,
            )

    def __init_blacklist(self) -> None:
        """Fill ``self.blacklist`` from the files in ``blacklist_dir``.

        Entries whose name contains ``README.md`` or ``.git`` are skipped.
        Every remaining entry is classified by each configured tool and the
        detected file type is added unless it is whitelisted.
        """
        self.__update_blacklist()
        for bad_file in self.blacklist_dir.iterdir():
            if "README.md" in bad_file.name or ".git" in bad_file.name:
                continue
            logger.debug(f"Processing known bad file {bad_file}")
            for tool in self.fileTypeTools:
                tool_instance: FileTypeToolInterface = tool()
                file_type: FileTypeInterface = tool_instance.file_type_of(
                    bad_file
                )
                logger.debug(f"{tool} classfied {bad_file} as {file_type}")
                if self.__is_too_generic(file_type):
                    # will produce too many false positives
                    logger.debug(
                        f"Not adding {file_type} to blacklist as it is "
                        " too generic"
                    )
                    continue
                self.blacklist.add(file_type)

    def __is_too_generic(self, file_type: FileTypeInterface) -> bool:
        """Return ``True`` if ``file_type`` is on the whitelist."""
        return file_type in self.whitelist

    def _run_all_tools(
        self,
    ) -> None:
        """For each available tool or library the set of detected file
        types is determined. All files with illegal file types are
        recorded.

        The per-tool result is appended to ``self.all_tool_findings``.
        Allowed file types are recorded with an empty path list, disallowed
        ones with the list of offending files.
        """
        logger.info(
            "Running multi-tool file type detection for "
            f"project {self.proj.id}"
        )
        for tool in self.fileTypeTools:
            logger.info(
                f"Using {tool.name()} to detect file types in project"
                f" {self.proj.id}"
            )
            tool_instance: FileTypeToolInterface = tool()
            findings: defaultdict[FileTypeInterface, list[Path]] = defaultdict(
                list
            )
            for file in self._gen_file_list():
                file_type: FileTypeInterface = tool_instance.file_type_of(file)
                logger.debug(f"{tool.name()} classfied {file} as {file_type}")
                # only record files with disallowed file types
                if self._is_ok(file_type):
                    # register the file type without recording a path
                    findings.setdefault(file_type, [])
                else:
                    logger.info(
                        f"{file} with {file_type} is considered harmful"
                    )
                    findings[file_type].append(file)
            logger.info(
                f"{tool.name()} detected file types "
                f"{self.__format_findings(findings)}"
            )
            self.all_tool_findings.append((tool, findings))
        logger.info(
            f"Results of file type detection: {self._format_findings()}"
        )

    def _format_findings(self) -> dict[str, dict[str, list[str]]]:
        """Return all tool findings in a string-only representation.

        :return: mapping of tool name to the formatted findings of that
            tool (file type string to list of POSIX path strings)
        """
        return {
            tool.name(): self.__format_findings(findings)
            for tool, findings in self.all_tool_findings
        }

    def __format_findings(
        self, findings: dict[FileTypeInterface, list[Path]]
    ) -> dict[str, list[str]]:
        """Convert ``findings`` to plain strings.

        :param findings: mapping of file type to paths
        :return: mapping of ``str(file_type)`` to the POSIX form of every
            path; if two file types share a string form the later one wins
        """
        return {
            str(file_type): [p.as_posix() for p in paths]
            for file_type, paths in findings.items()
        }

    def _is_ok(self, file_type: FileTypeInterface) -> bool:
        """Return ``True`` if ``file_type`` is not on the blacklist."""
        return file_type not in self.blacklist

    def _calc_score(
        self,
    ) -> float:
        """Return ``0.0`` if any violation was recorded, else ``1.0``."""
        return 0.0 if self.all_violations else 1.0

    def _determine_violations(self):
        """Collect the offending paths of all tools in
        ``self.all_violations``.

        File types with an empty path list are skipped so that they do not
        create empty entries in the violations mapping.
        """
        for _, findings in self.all_tool_findings:
            for file_type, path_list in findings.items():
                if path_list:
                    self.all_violations[file_type].extend(path_list)

    def run(self, args_dict: Optional[dict[str, Any]] = None) -> dict[str, Any]:
        """Run the check and return its result.

        :param args_dict: passed through unchanged to the parent's ``run``
        :return: the dict returned by the parent's ``run`` merged with the
            keys ``"score"`` and ``"results"``; ``results["ft_paths"]`` is a
            list of ``[file type string, list of path strings]`` pairs
        :raises AssertionError: if ``results_valid`` rejects the results
        """
        ret: dict[str, Any] = super().run(args_dict)
        self._run_all_tools()
        self._determine_violations()
        results: dict[str, Any] = {
            "ft_paths": [
                [file_type, paths]
                for file_type, paths in self.__format_findings(
                    self.all_violations
                ).items()
            ]
        }
        # Internal sanity check of the produced structure; results_valid is
        # defined outside this class body.
        assert self.results_valid(results)
        return ret | {
            "score": self._calc_score(),
            "results": results,
        }