# Source code for src.checks.sast_usage_basic

"""Implementation of the SastUsageBasic check"""

from __future__ import annotations

import csv
import json
import logging
import re
from collections import defaultdict
from enum import Enum
from itertools import zip_longest
from pathlib import Path
from typing import Any, Optional, cast

import jsonschema

from src.config import context
from src.exceptions import CheckConstructionError
from src.interfaces import CheckInterface

logger: logging.Logger = logging.getLogger(__name__)


class SastToolKind(Enum):
    """
    Enumerates the different classes of SAST tools that we differentiate
    between in our check. Each SastTool has one and it determines "how good"
    it is if we detect it in a project.
    """

    LINTER = 1
    SECURITY = 2
    SECRET = 3
    SCA = 4

    @classmethod
    def weight(cls, kind: SastToolKind) -> float:
        """
        Encodes "how good" it is if we detect the tool in a project.

        :param kind: The tool class to look up.
        :return: The score weight of that tool class.
        """
        # Security-focused tools count in full; everything else counts half.
        weights: dict[SastToolKind, float] = {
            cls.LINTER: 0.5,
            cls.SECURITY: 1,
            cls.SECRET: 0.5,
            cls.SCA: 0.5,
        }
        return weights[kind]
# Check
class SastTool:
    """
    Represents a tool that we can hope to detect in a project.

    The tool's JSON definition supplies one regular expression per detection
    strategy; :meth:`check_file` applies those strategies to a single file.
    """

    def __init__(self, tool_json: dict[str, Any]):
        """
        :param tool_json: A map describing the tool; can be accessed by
            indexing the instance; domain is defined by the tool JSON schema
        """
        # Magic values for keys that store regular expressions. They are
        # automatically transformed to fixed regexes. Here are the ones that are not
        # parametrized by the tool. The others are constructed in the init function.
        #
        # note: empty fields are equivalent to $matchnothing
        self.DEFAULT_SPECIAL_REGEX_VALUES: dict[str, re.Pattern[str]] = {
            # FIX: was r"\\." (a literal backslash followed by any character),
            # which can never match a POSIX path. A literal dot matches the
            # convention of the sibling $PCH_path_re/$CF_path_re entries.
            "$PDF_path_re": re.compile(r"\."),
            "$PDF_name_re": re.compile(r"^\.gitlab-ci\.yml$"),
            "$PCH_path_re": re.compile(r"\."),
            "$PCH_name_re": re.compile(r"precommit"),
            "$CF_path_re": re.compile(r"\."),
            "$readme": re.compile(r"(README|[Rr]eadme)"),
            "$matchall": re.compile(r""),
            "$matchnothing": re.compile(r"(?!x)x"),
            # FIX: same literal-backslash issue as $PDF_path_re above.
            "$rootdir": re.compile(r"\."),
        }
        # Default language source file regex values: maps programming languages to
        # a regular expression that should recognize source files of that language
        # by name (usually file extension). Combined to form the source_file_regex
        # of a tool and then compiled. If there is no regex for a language it falls
        # back to a match-nothing regex.
        self.DEFAULT_LANGUAGE_REGEX_VALUES: dict[str, str] = defaultdict(
            lambda: r"(?!x)x",
            {
                "python": r"^.*?\.py$",
                "dockerfile": r"^Dockerfile$",
                "shell": r"^.*?\.sh$",
                "rust": r"^.*?\.rs$",
                "typescript": r"^.*?\.ts$",
                "tsx": r"^.*?\.tsx$",
                # FIX: was r"^.*?\.js[x]$", which required the "x" and thus
                # only matched ".jsx" files — plain ".js" sources were never
                # recognized. ".jsx?" accepts both.
                "javascript": r"^.*?\.jsx?$",
                "fluent": r"^.*?\.ftl$",
            },
        )
        self.name: str = tool_json["name"]
        self.kind: SastToolKind = SastToolKind[tool_json["kind"]]
        # NOTE(review): the tool name is used verbatim as a regex; a name
        # containing metacharacters (e.g. "c++lint") would misbehave —
        # confirm names are regex-safe or wrap them in re.escape().
        name_regex = re.compile(tool_json["name"])
        # Special values that are parametrized by the tool itself: data
        # regexes default to "the tool's name appears in the content".
        self.special_regex_values: dict[
            str, re.Pattern[str]
        ] = self.DEFAULT_SPECIAL_REGEX_VALUES | {
            "$PDF_data_re": name_regex,
            "$PCH_data_re": name_regex,
            "$DD_data_re": name_regex,
        }
        tool_json = self.__compile_regex(tool_json)
        tool_json = self.__add_source_file_regex(tool_json)
        self.tool_json: dict[str, Any] = tool_json

    def __add_source_file_regex(
        self, tool_json: dict[str, Any]
    ) -> dict[str, Any]:
        """
        Adds a `source_file_regex` key to the input map and populates the
        value with a regex that should match the names of source files of
        languages in the input map's `languages` array.

        :return: The updated map
        """
        tool_json["source_file_regex"] = re.compile(
            "("
            + "|".join(
                [
                    self.DEFAULT_LANGUAGE_REGEX_VALUES[language]
                    for language in tool_json["languages"]
                ]
            )
            + ")"
        )
        # FIX: use the module logger instead of the root logger.
        logger.info(
            f"{self.name} is using {tool_json['source_file_regex']} to find source files"
        )
        return tool_json

    def __compile_regex(self, tool_json: dict[str, Any]) -> dict[str, Any]:
        """
        Replaces string values in the input dict that represent regular
        expressions with their compiled versions.

        :return: The updated dict
        """
        # These keys hold plain metadata, never regexes.
        non_regex_keys: set[str] = {
            "name",
            "description",
            "url",
            "languages",
            "stars",
            "applicable",
        }
        special_regex_value_prefix: str = "$"
        for k, v in tool_json.items():
            if k in non_regex_keys:
                continue
            if v.startswith(special_regex_value_prefix):
                # replace fields with magic values with their pre-compiled
                # regexes
                tool_json[k] = self.special_regex_values[v]
            elif v == "":
                # use the match-nothing regex for empty fields
                tool_json[k] = self.special_regex_values["$matchnothing"]
            else:
                # a bare "." means a literal dot, not "any character"
                tool_json[k] = re.compile("\\." if v == "." else v)
        return tool_json

    def __getitem__(self, index: str) -> Any:
        """Expose the underlying tool definition by key."""
        return self.tool_json[index]

    @classmethod
    def from_file_validate(cls, schema: dict[str, Any], file: Path) -> SastTool:
        """
        Constructs an instance from a JSON file describing a tool.
        Validates the file against the expected schema before using it.

        :param schema: JSON schema a tool definition must satisfy
        :param file: path to the tool's JSON definition
        :raises jsonschema.ValidationError: if the file violates the schema
        """
        with file.open(mode="r") as f:
            tool_json: dict[str, Any] = json.load(f)
            jsonschema.validate(tool_json, schema)
            return cls(tool_json)

    # pylint: disable-next=too-complex,too-many-return-statements
    def check_file(self, f: Path) -> bool:
        """
        :return: True iff the file 'f' indicates that the SAST tool is being
            used in the project
        """
        logger.debug(f"Check {self['name']} on {f.name}")
        if not f.is_file():
            return False
        logstring = f"Detected {self.name} in {f} via strategy "
        # If the file is a source file of a supported language, see if we can
        # find tool-specific source code or comment artifacts.
        if self._check_file(f, self["source_file_regex"], self["SLD"]):
            logger.info(logstring + "SLD")
            return True
        # If the file looks like a tool-specific configuration file report the
        # tool as being present. Optionally check that the content indicates it
        # as well.
        if self._check_file(f, self["TCF_name"], self["TCF_data"]):
            logger.info(logstring + "TCF")
            return True
        # If the file looks like a pipeline/CI/CD/... definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PDF_name"], self["PDF_data"]):
            logger.info(logstring + "PDF")
            return True
        # If the file looks like a pre-commit hook definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PCH_name"], self["PCH_data"]):
            logger.info(logstring + "PCH")
            return True
        # If the file looks like a language-tooling configuration file, check
        # the content to see if it looks like they configure the tool in it.
        if self._check_file(f, self["CF_name"], self["CF_data"]):
            logger.info(logstring + "CF")
            return True
        # If the file looks like a Readme, check if they proudly present the
        # tool's badge in it.
        if self._check_file(f, self["BDG_name"], self["BDG_data"]):
            # FIX: this strategy was the only one that did not log its hit.
            logger.info(logstring + "BDG")
            return True
        return False

    def _check_file(
        self,
        f: Path,
        name_regex: Optional[re.Pattern[str]],
        content_regex: Optional[re.Pattern[str]] = None,
    ) -> bool:
        """
        :return: True iff a file's name and contents match the respective
            regular expressions; if no content regex is supplied only the
            filename is checked
        """
        if not name_regex or not re.search(name_regex, f.name):
            return False
        if not content_regex:
            logger.info(
                f"Filename matches {name_regex} and there is no content_regex"
            )
            return True
        # FIX: tolerate binary/undecodable files instead of crashing the scan
        # with a UnicodeDecodeError — every project file passes through here.
        with f.open(mode="r", errors="ignore") as lines:
            for line in lines:
                if not re.search(content_regex, line):
                    continue
                logger.info(
                    f"Filename matches {name_regex} and <{line}> matches {content_regex}"
                )
                return True
        return False

    @property
    def weight(self) -> float:
        """
        Not all tools are equally good. Here we supply a rather arbitrary
        weight to influence how much effect the presence of the tool has on
        the final score. The higher the weight, the more I like the tool.

        :return: The weight
        """
        return SastToolKind.weight(self.kind)
class SastUsageBasic(CheckInterface):
    """
    Check that detects which SAST tools a project uses, per programming
    language, and turns the detections into a score.
    """

    # Paths matching this pattern are excluded from the scan.
    exclude: re.Pattern[str] = re.compile("(^.git$|test)")

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.tool_schema: dict[str, Any] = self.__load_tool_schema()
        self.__generate_tools()
        self.tools: list[SastTool] = self.__load_tools()
        self.lang_tools: dict[str, list[SastTool]] = self.__build_lang_tools()
        if not self.proj.languages():
            # FIX: typo "conatins" -> "contains" in the error message.
            raise CheckConstructionError("Project contains no languages?!")

    def __load_tool_schema(self) -> dict[str, Any]:
        """
        Loads the JSON schema of the tool definitions from permanent storage.

        config: tool_schema

        :return: JSON schema of a single tool
        """
        schema_path: Path = context.settings[f"{self.name()}_tool_schema"]
        assert schema_path.is_file()
        logger.info(
            f"Loading tool definition schema for {self.name()} from {schema_path}"
        )
        with schema_path.open(mode="r") as f:
            return json.load(f)

    def __generate_tools(self) -> None:
        """
        Generates the individual JSON tool definitions from a CSV file that
        describes all of them.

        config: tools_csv
        effect: populates the directory config::tools_dir
        note: no-op if directory is not empty
        """
        tools_dir: Path = context.settings[f"{self.name()}_tools_dir"]
        # NOTE(review): the documented "no-op if directory is not empty"
        # guard is disabled ("Always generate for testing") — restore it
        # before release or update the docstring.
        # grepme: Always generate for testing
        # if len(list(tools_dir.iterdir())) != 0:
        #     return
        tool_defs: Path = Path(context.settings[f"{self.name()}_tools_csv"])
        with tool_defs.open(mode="r", encoding="UTF-8") as f:
            reader = csv.reader(f, delimiter="\t")
            header: list[str] = next(reader)
            for row in reader:
                tool: dict[str, Any] = dict(
                    zip_longest(header, row, fillvalue="")
                )
                # FIX: csv.reader yields strings only, so the original
                # `tool["applicable"] == 0` was always False and
                # non-applicable tools were never skipped.
                if str(tool["applicable"]) == "0":
                    continue
                tool["languages"] = str(tool["languages"]).split(" ")
                jsonschema.validate(tool, self.tool_schema)
                tool_json: str = json.dumps(tool)
                logger.info(
                    f"""Generated tool: {(tools_dir / f"{tool['name']}.json").as_posix()}"""
                )
                (tools_dir / f"{tool['name']}.json").write_text(tool_json)

    def __load_tools(self) -> list[SastTool]:
        """
        Loads the JSON tool definitions from permanent storage.

        config: tools_dir

        :return: list of tools
        """
        tools_dir: Path = context.settings[f"{self.name()}_tools_dir"]
        assert tools_dir.is_dir()
        tools: list[SastTool] = [
            SastTool.from_file_validate(self.tool_schema, file)
            for file in tools_dir.iterdir()
        ]
        logger.info(f"Loaded tools {[t['name'] for t in tools]}")
        return tools

    def __build_lang_tools(self) -> dict[str, list[SastTool]]:
        """
        Constructs a mapping from programming languages to tools

        :return: The mapping
        """
        mapping: dict[str, list[SastTool]] = defaultdict(list)
        for tool in self.tools:
            for language in tool["languages"]:
                mapping[language].append(tool)
        # FIX: use the module logger instead of the root logger.
        logger.info(
            f"Built mapping: { {lng: [t['name'] for t in tools] for lng, tools in mapping.items()} }"
        )
        return mapping

    def _detect_sast_tools(
        self,
    ) -> dict[str, list[SastTool]]:
        """
        Performs the actual "analysis". Builds map that takes programming
        languages to the set of SAST tools that the project uses for this
        language.

        :return: The mapping
        """
        detected_tools: dict[str, list[SastTool]] = defaultdict(list)
        # Hoisted out of the loop: the project's languages (lower-cased) and
        # the file list, both of which the original recomputed per language.
        project_langs: set[str] = {
            lng.lower()
            for lng in cast(dict[str, Any], self.proj.languages())
        }
        files = list(self._gen_file_list())
        for lang, tools in self.lang_tools.items():
            if lang not in project_langs:
                continue
            for f in files:
                if not tools:
                    break
                # must iterate over a copy since we remove items
                for tool in list(tools):
                    if tool.check_file(f):
                        # FIX: typo "lanugage" -> "language" in the log line.
                        logger.info(
                            f"Removing tool {tool.name} for language {lang}"
                        )
                        detected_tools[lang].append(tool)
                        tools.remove(tool)
        logger.info(
            "Detected SAST tools "
            f"{[(lang, [tool.name for tool in tools]) for lang, tools in detected_tools.items()]}"
            f" for project {self.proj.id}."
        )
        return detected_tools

    def _calc_score(self, detected_tools: dict[str, list[SastTool]]) -> float:
        """
        Consumes the result of `_detect_sast_tools` and calculates the final
        score out of it.

        :return: score
        """
        score: float = 0.0
        for lang, lweight in cast(
            dict[str, Any], self.proj.languages()
        ).items():
            assert isinstance(lang, str)
            assert isinstance(lweight, float)
            # language weights arrive as percentages — normalize to [0, 1]
            lweight_normed: float = float(lweight) / 100
            # FIX: use the module logger instead of the root logger.
            logger.info(f"Language {lang} has weight {lweight_normed}")
            tweight: float = 0.0
            # the single best tool per language determines its contribution
            for tool in detected_tools[lang.lower()]:
                tweight = max(tweight, tool.weight)
            logger.info(f"Maximum tool weight is {tweight}")
            score += lweight_normed * tweight
        return score

    def run(self, args_dict: Optional[dict[str, Any]] = None) -> dict[str, Any]:
        """
        Runs the check: detects the SAST tools, validates and scores the
        result, and merges it with the base-class result.

        :return: dict with "score" and "results" keys
        """
        ret: dict[str, Any] = super().run(args_dict)
        detected_tools: dict[str, list[SastTool]] = self._detect_sast_tools()
        results: dict[str, Any] = {
            "lang_tools": [
                [lang, [tool.name for tool in tools]]
                for lang, tools in detected_tools.items()
            ],
        }
        assert self.results_valid(results)
        # NOTE(review): `| ret` lets the base-class result override "score"
        # and "results" on a key collision — confirm that is intended
        # (otherwise this should read `ret | {...}`).
        return {
            "score": self._calc_score(detected_tools),
            "results": results,
        } | ret