# Source code for src.checks.sast_usage_basic

"""Implementation of the SastUsageBasic check"""

from __future__ import annotations

import csv
import json
import logging
import re
from collections import defaultdict
from enum import Enum
from itertools import zip_longest
from pathlib import Path
from typing import Any, Optional, cast

import jsonschema

from src.config import context
from src.exceptions import CheckConstructionError
from src.interfaces import CheckInterface

logger: logging.Logger = logging.getLogger(__name__)


class SastToolKind(Enum):
    """
    Enumerates the different classes of SAST tools that we differentiate
    between in our check. Each SastTool has one and it determines "how good"
    it is if we detect it in a project.
    """

    LINTER = 1
    SECURITY = 2
    SECRET = 3
    SCA = 4

    @classmethod
    def weight(cls, kind: SastToolKind) -> float:
        """
        Encodes "how good" it is if we detect the tool in a project.

        :param kind: The tool class to look up.
        :return: The score weight of that tool class.
        """
        # Security-focused tools count in full; everything else counts half.
        weights: dict[SastToolKind, float] = {
            cls.LINTER: 0.5,
            cls.SECURITY: 1,
            cls.SECRET: 0.5,
            cls.SCA: 0.5,
        }
        return weights[kind]
# Check
class SastTool:
    """
    Represents a tool that we can hope to detect in a project.

    The tool's JSON definition supplies one regular expression per detection
    strategy; :meth:`check_file` applies those strategies to a single file.
    """

    def __init__(self, tool_json: dict[str, Any]):
        """
        :param tool_json: A map describing the tool; can be accessed by
            indexing the instance; domain is defined by the tool JSON schema
        """
        # Magic values for keys that store regular expressions. They are
        # automatically transformed to fixed regexes. Here are the ones that are not
        # parametrized by the tool. The others are constructed in the init function.
        #
        # note: empty fields are equivalent to $matchnothing
        self.DEFAULT_SPECIAL_REGEX_VALUES: dict[str, re.Pattern[str]] = {
            # FIX: was r"\\." (a literal backslash followed by any character),
            # which can never match a POSIX path. A literal dot matches the
            # convention of the sibling $PCH_path_re/$CF_path_re entries.
            "$PDF_path_re": re.compile(r"\."),
            "$PDF_name_re": re.compile(r"^\.gitlab-ci\.yml$"),
            "$PCH_path_re": re.compile(r"\."),
            "$PCH_name_re": re.compile(r"precommit"),
            "$CF_path_re": re.compile(r"\."),
            "$readme": re.compile(r"(README|[Rr]eadme)"),
            "$matchall": re.compile(r""),
            "$matchnothing": re.compile(r"(?!x)x"),
            # FIX: same literal-backslash issue as $PDF_path_re above.
            "$rootdir": re.compile(r"\."),
        }
        # Default language source file regex values: maps programming languages to
        # a regular expression that should recognize source files of that language
        # by name (usually file extension). Combined to form the source_file_regex
        # of a tool and then compiled. If there is no regex for a language it falls
        # back to a match-nothing regex.
        self.DEFAULT_LANGUAGE_REGEX_VALUES: dict[str, str] = defaultdict(
            lambda: r"(?!x)x",
            {
                "python": r"^.*?\.py$",
                "dockerfile": r"^Dockerfile$",
                "shell": r"^.*?\.sh$",
                "rust": r"^.*?\.rs$",
                "typescript": r"^.*?\.ts$",
                "tsx": r"^.*?\.tsx$",
                # FIX: was r"^.*?\.js[x]$", which required the "x" and thus
                # only matched ".jsx" files — plain ".js" sources were never
                # recognized. ".jsx?" accepts both.
                "javascript": r"^.*?\.jsx?$",
                "fluent": r"^.*?\.ftl$",
            },
        )
        self.name: str = tool_json["name"]
        self.kind: SastToolKind = SastToolKind[tool_json["kind"]]
        # NOTE(review): the tool name is used verbatim as a regex; a name
        # containing metacharacters (e.g. "c++lint") would misbehave —
        # confirm names are regex-safe or wrap them in re.escape().
        name_regex = re.compile(tool_json["name"])
        # Special values that are parametrized by the tool itself: data
        # regexes default to "the tool's name appears in the content".
        self.special_regex_values: dict[
            str, re.Pattern[str]
        ] = self.DEFAULT_SPECIAL_REGEX_VALUES | {
            "$PDF_data_re": name_regex,
            "$PCH_data_re": name_regex,
            "$DD_data_re": name_regex,
        }
        tool_json = self.__compile_regex(tool_json)
        tool_json = self.__add_source_file_regex(tool_json)
        self.tool_json: dict[str, Any] = tool_json

    def __add_source_file_regex(
        self, tool_json: dict[str, Any]
    ) -> dict[str, Any]:
        """
        Adds a `source_file_regex` key to the input map and populates the
        value with a regex that should match the names of source files of
        languages in the input map's `languages` array.

        :return: The updated map
        """
        tool_json["source_file_regex"] = re.compile(
            "("
            + "|".join(
                [
                    self.DEFAULT_LANGUAGE_REGEX_VALUES[language]
                    for language in tool_json["languages"]
                ]
            )
            + ")"
        )
        # FIX: use the module logger instead of the root logger.
        logger.info(
            f"{self.name} is using {tool_json['source_file_regex']} to find source files"
        )
        return tool_json

    def __compile_regex(self, tool_json: dict[str, Any]) -> dict[str, Any]:
        """
        Replaces string values in the input dict that represent regular
        expressions with their compiled versions.

        :return: The updated dict
        """
        # These keys hold plain metadata, never regexes.
        non_regex_keys: set[str] = {
            "name",
            "description",
            "url",
            "languages",
            "stars",
            "applicable",
        }
        special_regex_value_prefix: str = "$"
        for k, v in tool_json.items():
            if k in non_regex_keys:
                continue
            if v.startswith(special_regex_value_prefix):
                # replace fields with magic values with their pre-compiled
                # regexes
                tool_json[k] = self.special_regex_values[v]
            elif v == "":
                # use the match-nothing regex for empty fields
                tool_json[k] = self.special_regex_values["$matchnothing"]
            else:
                # a bare "." means a literal dot, not "any character"
                tool_json[k] = re.compile("\\." if v == "." else v)
        return tool_json

    def __getitem__(self, index: str) -> Any:
        """Expose the underlying tool definition by key."""
        return self.tool_json[index]

    @classmethod
    def from_file_validate(cls, schema: dict[str, Any], file: Path) -> SastTool:
        """
        Constructs an instance from a JSON file describing a tool.
        Validates the file against the expected schema before using it.

        :param schema: JSON schema a tool definition must satisfy
        :param file: path to the tool's JSON definition
        :raises jsonschema.ValidationError: if the file violates the schema
        """
        with file.open(mode="r") as f:
            tool_json: dict[str, Any] = json.load(f)
            jsonschema.validate(tool_json, schema)
            return cls(tool_json)

    # pylint: disable-next=too-complex,too-many-return-statements
    def check_file(self, f: Path) -> bool:
        """
        :return: True iff the file 'f' indicates that the SAST tool is being
            used in the project
        """
        logger.debug(f"Check {self['name']} on {f.name}")
        if not f.is_file():
            return False
        logstring = f"Detected {self.name} in {f} via strategy "
        # If the file is a source file of a supported language, see if we can
        # find tool-specific source code or comment artifacts.
        if self._check_file(f, self["source_file_regex"], self["SLD"]):
            logger.info(logstring + "SLD")
            return True
        # If the file looks like a tool-specific configuration file report the
        # tool as being present. Optionally check that the content indicates it
        # as well.
        if self._check_file(f, self["TCF_name"], self["TCF_data"]):
            logger.info(logstring + "TCF")
            return True
        # If the file looks like a pipeline/CI/CD/... definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PDF_name"], self["PDF_data"]):
            logger.info(logstring + "PDF")
            return True
        # If the file looks like a pre-commit hook definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PCH_name"], self["PCH_data"]):
            logger.info(logstring + "PCH")
            return True
        # If the file looks like a language-tooling configuration file, check
        # the content to see if it looks like they configure the tool in it.
        if self._check_file(f, self["CF_name"], self["CF_data"]):
            logger.info(logstring + "CF")
            return True
        # If the file looks like a Readme, check if they proudly present the
        # tool's badge in it.
        if self._check_file(f, self["BDG_name"], self["BDG_data"]):
            # FIX: this strategy was the only one that did not log its hit.
            logger.info(logstring + "BDG")
            return True
        return False

    def _check_file(
        self,
        f: Path,
        name_regex: Optional[re.Pattern[str]],
        content_regex: Optional[re.Pattern[str]] = None,
    ) -> bool:
        """
        :return: True iff a file's name and contents match the respective
            regular expressions; if no content regex is supplied only the
            filename is checked
        """
        if not name_regex or not re.search(name_regex, f.name):
            return False
        if not content_regex:
            logger.info(
                f"Filename matches {name_regex} and there is no content_regex"
            )
            return True
        # FIX: tolerate binary/undecodable files instead of crashing the scan
        # with a UnicodeDecodeError — every project file passes through here.
        with f.open(mode="r", errors="ignore") as lines:
            for line in lines:
                if not re.search(content_regex, line):
                    continue
                logger.info(
                    f"Filename matches {name_regex} and <{line}> matches {content_regex}"
                )
                return True
        return False

    @property
    def weight(self) -> float:
        """
        Not all tools are equally good. Here we supply a rather arbitrary
        weight to influence how much effect the presence of the tool has on
        the final score. The higher the weight, the more I like the tool.

        :return: The weight
        """
        return SastToolKind.weight(self.kind)
class SastUsageBasic(CheckInterface):
    """
    Check that detects which SAST tools a project uses, per programming
    language, and turns the detections into a score.
    """

    # Paths matching this pattern are excluded from the scan.
    exclude: re.Pattern[str] = re.compile("(^.git$|test)")

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.tool_schema: dict[str, Any] = self.__load_tool_schema()
        self.__generate_tools()
        self.tools: list[SastTool] = self.__load_tools()
        self.lang_tools: dict[str, list[SastTool]] = self.__build_lang_tools()
        if not self.proj.languages():
            # FIX: typo "conatins" -> "contains" in the error message.
            raise CheckConstructionError("Project contains no languages?!")

    def __load_tool_schema(self) -> dict[str, Any]:
        """
        Loads the JSON schema of the tool definitions from permanent storage.

        config: tool_schema

        :return: JSON schema of a single tool
        """
        schema_path: Path = context.settings[f"{self.name()}_tool_schema"]
        assert schema_path.is_file()
        logger.info(
            f"Loading tool definition schema for {self.name()} from {schema_path}"
        )
        with schema_path.open(mode="r") as f:
            return json.load(f)

    def __generate_tools(self) -> None:
        """
        Generates the individual JSON tool definitions from a CSV file that
        describes all of them.

        config: tools_csv
        effect: populates the directory config::tools_dir
        note: no-op if directory is not empty
        """
        tools_dir: Path = context.settings[f"{self.name()}_tools_dir"]
        # NOTE(review): the documented "no-op if directory is not empty"
        # guard is disabled ("Always generate for testing") — restore it
        # before release or update the docstring.
        # grepme: Always generate for testing
        # if len(list(tools_dir.iterdir())) != 0:
        #     return
        tool_defs: Path = Path(context.settings[f"{self.name()}_tools_csv"])
        with tool_defs.open(mode="r", encoding="UTF-8") as f:
            reader = csv.reader(f, delimiter="\t")
            header: list[str] = next(reader)
            for row in reader:
                tool: dict[str, Any] = dict(
                    zip_longest(header, row, fillvalue="")
                )
                # FIX: csv.reader yields strings only, so the original
                # `tool["applicable"] == 0` was always False and
                # non-applicable tools were never skipped.
                if str(tool["applicable"]) == "0":
                    continue
                tool["languages"] = str(tool["languages"]).split(" ")
                jsonschema.validate(tool, self.tool_schema)
                tool_json: str = json.dumps(tool)
                logger.info(
                    f"""Generated tool: {(tools_dir / f"{tool['name']}.json").as_posix()}"""
                )
                (tools_dir / f"{tool['name']}.json").write_text(tool_json)

    def __load_tools(self) -> list[SastTool]:
        """
        Loads the JSON tool definitions from permanent storage.

        config: tools_dir

        :return: list of tools
        """
        tools_dir: Path = context.settings[f"{self.name()}_tools_dir"]
        assert tools_dir.is_dir()
        tools: list[SastTool] = [
            SastTool.from_file_validate(self.tool_schema, file)
            for file in tools_dir.iterdir()
        ]
        logger.info(f"Loaded tools {[t['name'] for t in tools]}")
        return tools

    def __build_lang_tools(self) -> dict[str, list[SastTool]]:
        """
        Constructs a mapping from programming languages to tools

        :return: The mapping
        """
        mapping: dict[str, list[SastTool]] = defaultdict(list)
        for tool in self.tools:
            for language in tool["languages"]:
                mapping[language].append(tool)
        # FIX: use the module logger instead of the root logger.
        logger.info(
            f"Built mapping: { {lng: [t['name'] for t in tools] for lng, tools in mapping.items()} }"
        )
        return mapping

    def _detect_sast_tools(
        self,
    ) -> dict[str, list[SastTool]]:
        """
        Performs the actual "analysis". Builds map that takes programming
        languages to the set of SAST tools that the project uses for this
        language.

        :return: The mapping
        """
        detected_tools: dict[str, list[SastTool]] = defaultdict(list)
        # Hoisted out of the loop: the project's languages (lower-cased) and
        # the file list, both of which the original recomputed per language.
        project_langs: set[str] = {
            lng.lower()
            for lng in cast(dict[str, Any], self.proj.languages())
        }
        files = list(self._gen_file_list())
        for lang, tools in self.lang_tools.items():
            if lang not in project_langs:
                continue
            for f in files:
                if not tools:
                    break
                # must iterate over a copy since we remove items
                for tool in list(tools):
                    if tool.check_file(f):
                        # FIX: typo "lanugage" -> "language" in the log line.
                        logger.info(
                            f"Removing tool {tool.name} for language {lang}"
                        )
                        detected_tools[lang].append(tool)
                        tools.remove(tool)
        logger.info(
            "Detected SAST tools "
            f"{[(lang, [tool.name for tool in tools]) for lang, tools in detected_tools.items()]}"
            f" for project {self.proj.id}."
        )
        return detected_tools

    def _calc_score(self, detected_tools: dict[str, list[SastTool]]) -> float:
        """
        Consumes the result of `_detect_sast_tools` and calculates the final
        score out of it.

        :return: score
        """
        score: float = 0.0
        for lang, lweight in cast(
            dict[str, Any], self.proj.languages()
        ).items():
            assert isinstance(lang, str)
            assert isinstance(lweight, float)
            # language weights arrive as percentages — normalize to [0, 1]
            lweight_normed: float = float(lweight) / 100
            # FIX: use the module logger instead of the root logger.
            logger.info(f"Language {lang} has weight {lweight_normed}")
            tweight: float = 0.0
            # the single best tool per language determines its contribution
            for tool in detected_tools[lang.lower()]:
                tweight = max(tweight, tool.weight)
            logger.info(f"Maximum tool weight is {tweight}")
            score += lweight_normed * tweight
        return score

    def run(self, args_dict: Optional[dict[str, Any]] = None) -> dict[str, Any]:
        """
        Runs the check: detects the SAST tools, validates and scores the
        result, and merges it with the base-class result.

        :return: dict with "score" and "results" keys
        """
        ret: dict[str, Any] = super().run(args_dict)
        detected_tools: dict[str, list[SastTool]] = self._detect_sast_tools()
        results: dict[str, Any] = {
            "lang_tools": [
                [lang, [tool.name for tool in tools]]
                for lang, tools in detected_tools.items()
            ],
        }
        assert self.results_valid(results)
        # NOTE(review): `| ret` lets the base-class result override "score"
        # and "results" on a key collision — confirm that is intended
        # (otherwise this should read `ret | {...}`).
        return {
            "score": self._calc_score(detected_tools),
            "results": results,
        } | ret