"""Check that determines the file type for every file in
the project and compares it to a blacklist of binary executable file
formats"""
import logging
import re
from collections import defaultdict
from collections.abc import Hashable
from pathlib import Path
from typing import Any, Optional
from fact_helper_file import get_file_type_from_path # type: ignore[import]
from git.repo import Repo
from src.config import context
from src.interfaces import CheckInterface
from src.opencode_git import clone_project
from .interfaces_checked_in_binaries import (
FileTypeInterface,
FileTypeToolInterface,
)
logger = logging.getLogger(__name__)
# Tool: fkie-cad / fact_helper_file
class FactHelperFileFileType(FileTypeInterface):
    """File type description built from a ``fact_helper_file`` result.

    Instances keep the ``mime`` and ``full`` entries of the mapping they
    were created from. Only ``mime`` is part of the value returned by
    :meth:`_key`.
    """

    def __init__(self, ft: dict[str, str]) -> None:
        """Store the ``"mime"`` and ``"full"`` entries of *ft*.

        :param ft: mapping that must contain the keys ``"mime"`` and
            ``"full"``.
        :raises KeyError: if one of the two keys is missing.
        """
        mime_type, full_description = ft["mime"], ft["full"]
        self.mime: str = mime_type
        self.full: str = full_description

    def _key(self) -> tuple[Hashable, ...]:
        """Return the one-element tuple ``(mime,)``.

        ``full`` is deliberately not included.
        NOTE(review): presumably ``FileTypeInterface`` derives equality
        and hashing from this key -- confirm against the interface.
        """
        identity: tuple[Hashable, ...] = (self.mime,)
        return identity
class FactHelperFile(FileTypeToolInterface):
    """File type tool backed by ``fact_helper_file``."""

    def file_type_of(self, file: Path) -> FactHelperFileFileType:
        """Classify *file* with ``get_file_type_from_path``.

        :param file: path of the file to classify.
        :returns: the tool's result wrapped in a
            :class:`FactHelperFileFileType`.
        """
        detected = get_file_type_from_path(file)
        return FactHelperFileFileType(detected)
# check
class CheckedInBinaries(CheckInterface):
    """Represents a check that determines the file type for every file in
    the project and compares it to a blacklist of binary executable file
    formats.

    The blacklist is built from sample files kept in ``blacklist_dir``.
    Every sample is classified with each tool in ``fileTypeTools`` and the
    resulting file types are collected in ``blacklist``, except for those
    listed in ``whitelist``.
    """

    # Directory holding the local copy of the known-bad sample files.
    blacklist_dir: Path = context.settings["CheckedInBinaries_blacklist_dir"]
    # Not referenced by the code visible in this class.
    # NOTE(review): the "." in "^.git$" is unescaped and therefore matches
    # any character; left unchanged because its consumers are not visible
    # here -- confirm whether r"^\.git$" was intended.
    exclude: re.Pattern = re.compile("(^.git$|test)")

    def __init__(self, *args, **kwargs) -> None:
        """Set up tools, whitelist and blacklist for the check.

        All arguments are forwarded unchanged to the parent initializer.
        The blacklist is (re)built on every instantiation, which updates
        or clones the blacklist repository.
        """
        super().__init__(*args, **kwargs)
        self.fileTypeTools: list[type[FileTypeToolInterface]] = [
            FactHelperFile
        ]
        self.blacklist: set[FileTypeInterface] = set()
        # File types considered too generic to be blacklisted.
        self.whitelist: set[FileTypeInterface] = {
            FactHelperFileFileType(
                {"mime": "application/octet-stream", "full": "data"}
            )
        }

        # The blacklist was just created empty, so it is always populated
        # here.
        logger.info("Initializing blacklist of executable file formats")
        self.__init_blacklist()
        logger.info("Using blacklist %s", self.blacklist)

        # One entry per tool: the tool class and the paths it found,
        # grouped by file type.
        self.all_tool_findings: list[
            tuple[
                type[FileTypeToolInterface],
                dict[FileTypeInterface, list[Path]],
            ]
        ] = []
        # Findings of all tools merged by file type.
        self.all_violations: dict[FileTypeInterface, list[Path]] = (
            defaultdict(list)
        )

    def __update_blacklist(self) -> None:
        """Bring the local blacklist copy up to date.

        A non-empty ``blacklist_dir`` is treated as an existing git
        checkout and updated in place. A missing or empty directory
        triggers a fresh clone of the configured blacklist repository.
        """
        blacklist_dir = self.blacklist_dir
        # is_dir() first: iterdir() raises FileNotFoundError when the
        # directory does not exist yet, which would prevent the initial
        # clone.
        has_local_copy = blacklist_dir.is_dir() and any(
            blacklist_dir.iterdir()
        )
        if has_local_copy:
            # check for updates of an existing blacklist
            logger.info("Existing blacklist found, updating...")
            repo: Repo = Repo(blacklist_dir)
            git = repo.git
            git.fetch()
            git.merge("--strategy-option", "theirs", "--no-edit")
            git.pull("-X", "theirs")
        else:
            # fetch a new copy of the blacklist
            logger.info("You have no blacklist, fetching current version ...")
            clone_project(
                context.settings["CheckedInBinaries_blacklist_repo"],
                blacklist_dir,
            )

    def __init_blacklist(self) -> None:
        """Fill ``self.blacklist`` from the files in ``blacklist_dir``.

        Entries whose name contains ``README.md`` or ``.git`` are skipped.
        Every remaining entry is classified by each configured tool and
        the resulting file type is added unless it is whitelisted.
        """
        self.__update_blacklist()
        for bad_file in self.blacklist_dir.iterdir():
            if "README.md" in bad_file.name or ".git" in bad_file.name:
                continue
            logger.debug("Processing known bad file %s", bad_file)
            for tool in self.fileTypeTools:
                tool_instance: FileTypeToolInterface = tool()
                file_type: FileTypeInterface = tool_instance.file_type_of(
                    bad_file
                )
                logger.debug(
                    "%s classfied %s as %s", tool, bad_file, file_type
                )
                if self.__is_too_generic(file_type):
                    # will produce too many false positives
                    logger.debug(
                        "Not adding %s to blacklist as it is  too generic",
                        file_type,
                    )
                    continue
                self.blacklist.add(file_type)

    def __is_too_generic(self, file_type: FileTypeInterface) -> bool:
        """Return ``True`` if *file_type* is contained in the whitelist."""
        return file_type in self.whitelist

    def __format_findings(
        self, findings: dict[FileTypeInterface, list[Path]]
    ) -> dict[str, list[str]]:
        """Convert *findings* to plain strings.

        :param findings: paths grouped by file type.
        :returns: mapping of ``str(file_type)`` to the POSIX-style string
            of every path. File types with the same string form collapse
            into one entry; the last one wins.
        """
        return {
            str(file_type): [path.as_posix() for path in paths]
            for file_type, paths in findings.items()
        }

    def _is_ok(self, file_type: FileTypeInterface) -> bool:
        """Return ``True`` if *file_type* is not on the blacklist."""
        return file_type not in self.blacklist

    def _calc_score(
        self,
    ) -> float:
        """Return ``1.0`` if no violations were recorded, else ``0.0``."""
        return 0.0 if self.all_violations else 1.0

    def _determine_violations(self) -> None:
        """Merge the findings of all tools into ``self.all_violations``.

        File types with an empty path list are skipped so that they do not
        create an (empty) entry, which would count as a violation in
        :meth:`_calc_score`.
        """
        for _, findings in self.all_tool_findings:
            for file_type, path_list in findings.items():
                if path_list:
                    self.all_violations[file_type].extend(path_list)

    def run(
        self, args_dict: Optional[dict[str, Any]] = None
    ) -> dict[str, Any]:
        """Execute the check.

        :param args_dict: forwarded unchanged to the parent ``run``.
        :returns: the parent's result merged with ``"score"`` and
            ``"results"``; ``results["ft_paths"]`` is a list of
            ``[file_type, paths]`` pairs of strings.
        :raises AssertionError: if ``results_valid`` rejects the results.

        ``_run_all_tools`` and ``results_valid`` are not defined in this
        class; they are expected to come from elsewhere (e.g. the parent).
        """
        ret: dict[str, Any] = super().run(args_dict)
        self._run_all_tools()
        self._determine_violations()
        formatted = self.__format_findings(self.all_violations)
        results: dict[str, Any] = {
            "ft_paths": [
                [file_type, paths] for file_type, paths in formatted.items()
            ]
        }
        assert self.results_valid(results)
        return ret | {
            "score": self._calc_score(),
            "results": results,
        }