# Source code for src.checks.sast_usage_basic
"""Implementation of the SastUsageBasic check"""
from __future__ import annotations
import csv
import json
import logging
import re
from collections import defaultdict
from enum import Enum
from itertools import zip_longest
from pathlib import Path
from typing import Any, Optional, cast
import jsonschema
from src.config import context
from src.exceptions import CheckConstructionError
from src.interfaces import CheckInterface
logger: logging.Logger = logging.getLogger(__name__)
class SastToolKind(Enum):
    """
    Enumerates the different classes of SAST tools that we differentiate between
    in our check. Each SastTool has one and it determines "how good" it is if
    we detect it in a project.
    """

    LINTER = 1
    SECURITY = 2
    SECRET = 3
    SCA = 4

    @classmethod
    def weight(cls, kind: SastToolKind) -> float:
        """
        Encodes "how good" it is if we detect the tool in a project.

        :param kind: The tool class to look up
        :return: The weight of that class (security scanners count double)
        """
        # Dedicated security scanners are valued twice as highly as the rest.
        weights: dict[SastToolKind, float] = {
            cls.LINTER: 0.5,
            cls.SECURITY: 1,
            cls.SECRET: 0.5,
            cls.SCA: 0.5,
        }
        return weights[kind]
class SastTool:
    """
    Represents a tool that we can hope to detect in a project.

    Tool metadata is accessed by indexing the instance (``tool["name"]``);
    the set of valid keys is defined by the tool JSON schema plus the
    derived ``source_file_regex`` key added at construction time.
    """

    def __init__(self, tool_json: dict[str, Any]):
        """
        :param tool_json: A map describing the tool; can be accessed by indexing the instance; domain is defined by the tool JSON schema
        """
        # Magic values for keys that store regular expressions. They are
        # automatically transformed to fixed regexes. Here are the ones that are not
        # parametrized by the tool. The others are constructed in the init function.
        #
        # note: empty fields are equivalent to $matchnothing
        self.DEFAULT_SPECIAL_REGEX_VALUES: dict[str, re.Pattern[str]] = {
            "$PDF_path_re": re.compile(r"\\."),
            "$PDF_name_re": re.compile(r"^\.gitlab-ci\.yml$"),
            "$PCH_path_re": re.compile(r"\."),
            "$PCH_name_re": re.compile(r"precommit"),
            "$CF_path_re": re.compile(r"\."),
            "$readme": re.compile(r"(README|[Rr]eadme)"),
            "$matchall": re.compile(r""),
            "$matchnothing": re.compile(r"(?!x)x"),
            "$rootdir": re.compile(r"\\."),
        }
        # Default language source file regex values: maps programming languages to
        # a regular expression that should recognize source files of that language
        # by name (usually file extension). Combined to form the source_file_regex
        # of a tool and then compiled. If there is no regex for a language it falls
        # back to a match-nothing regex.
        self.DEFAULT_LANGUAGE_REGEX_VALUES: dict[str, str] = defaultdict(
            lambda: r"(?!x)x",
            {
                "python": r"^.*?\.py$",
                "dockerfile": r"^Dockerfile$",
                "shell": r"^.*?\.sh$",
                "rust": r"^.*?\.rs$",
                "typescript": r"^.*?\.ts$",
                "tsx": r"^.*?\.tsx$",
                # fixed: `[x]` without `?` matched only ".jsx", never ".js"
                "javascript": r"^.*?\.js[x]?$",
                "fluent": r"^.*?\.ftl$",
            },
        )
        self.name: str = tool_json["name"]
        self.kind: SastToolKind = SastToolKind[tool_json["kind"]]
        # NOTE(review): the tool name is used verbatim as a regex; a name
        # containing metacharacters would need re.escape — confirm intent.
        name_regex = re.compile(tool_json["name"])
        # Special regex values that are parametrized by the tool's name.
        self.special_regex_values: dict[
            str, re.Pattern[str]
        ] = self.DEFAULT_SPECIAL_REGEX_VALUES | {
            "$PDF_data_re": name_regex,
            "$PCH_data_re": name_regex,
            "$DD_data_re": name_regex,
        }
        tool_json = self.__compile_regex(tool_json)
        tool_json = self.__add_source_file_regex(tool_json)
        self.tool_json: dict[str, Any] = tool_json

    def __getitem__(self, key: str) -> Any:
        """
        Exposes the tool metadata by indexing the instance, as promised in
        the class documentation (``tool["SLD"]``, ``tool["languages"]``, ...).

        :raises KeyError: If the key is not part of the tool definition
        """
        return self.tool_json[key]

    def __add_source_file_regex(
        self, tool_json: dict[str, Any]
    ) -> dict[str, Any]:
        """
        Adds a `source_file_regex` key to the input map and populates the value
        with a regex that should match the names of source files of languages
        the input maps' `languages` array.

        :return: The updated map
        """
        # Alternation over the per-language filename regexes; unknown
        # languages contribute a match-nothing branch via the defaultdict.
        tool_json["source_file_regex"] = re.compile(
            "("
            + "|".join(
                [
                    self.DEFAULT_LANGUAGE_REGEX_VALUES[language]
                    for language in tool_json["languages"]
                ]
            )
            + ")"
        )
        # fixed: use the module logger instead of the root logger
        logger.info(
            f"{self.name} is using {tool_json['source_file_regex']} to find source files"
        )
        return tool_json

    def __compile_regex(self, tool_json: dict[str, Any]) -> dict[str, Any]:
        """
        Replaces string values in the input dict that represent regular
        expressions with their compiled versions.

        :return: The updated dict
        """
        # Keys whose values are plain metadata, not regex sources.
        non_regex_keys: set[str] = {
            "name",
            "kind",  # fixed: the kind string was accidentally compiled as a regex
            "description",
            "url",
            "languages",
            "stars",
            "applicable",
        }
        special_regex_value_prefix: str = "$"
        for k, v in tool_json.items():
            if k in non_regex_keys:
                continue
            if v.startswith(special_regex_value_prefix):
                # replace fields with magic values with their pre-compiled
                # regexes
                tool_json[k] = self.special_regex_values[v]
            elif v == "":
                # use the match-nothing regex for empty fields
                tool_json[k] = self.special_regex_values["$matchnothing"]
            else:
                # a bare "." means a literal dot, not "any character"
                tool_json[k] = re.compile("\\." if v == "." else v)
        return tool_json

    @classmethod
    def from_file_validate(cls, schema: dict[str, Any], file: Path) -> SastTool:
        """
        Constructs an instance from a JSON file describing a tool. Validates the
        file against the expected schema before using it.

        :raises jsonschema.ValidationError: If the file violates the schema
        """
        with file.open(mode="r") as f:
            tool_json: dict[str, Any] = json.load(f)
        jsonschema.validate(tool_json, schema)
        return cls(tool_json)

    # pylint: disable-next=too-complex,too-many-return-statements
    def check_file(self, f: Path) -> bool:
        """
        :return: True iff the file 'f' indicates that the SAST tool is being used in the project
        """
        logger.debug(f"Check {self['name']} on {f.name}")
        if not f.is_file():
            return False
        logstring = f"Detected {self.name} in {f} via strategy "
        # If the file is a source file of a supported language, see if we can
        # find tool-specific source code or comment artifacts.
        if self._check_file(f, self["source_file_regex"], self["SLD"]):
            logger.info(logstring + "SLD")
            return True
        # If the file looks like a tool-specific configuration file report the
        # tool as being present. Optionally check that the content indicates it
        # as well.
        if self._check_file(f, self["TCF_name"], self["TCF_data"]):
            logger.info(logstring + "TCF")
            return True
        # If the file looks like a pipeline/CI/CD/... definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PDF_name"], self["PDF_data"]):
            logger.info(logstring + "PDF")
            return True
        # If the file looks like a pre-commit hook definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PCH_name"], self["PCH_data"]):
            logger.info(logstring + "PCH")
            return True
        # If the file looks like a language-tooling configuration file, check
        # the content to see if it looks like they configure the tool in it.
        if self._check_file(f, self["CF_name"], self["CF_data"]):
            logger.info(logstring + "CF")
            return True
        # If the file looks like a Readme, check if they proudly present the
        # tool's badge in it.
        if self._check_file(f, self["BDG_name"], self["BDG_data"]):
            # fixed: BDG was the only strategy that did not log its detection
            logger.info(logstring + "BDG")
            return True
        return False

    def _check_file(
        self,
        f: Path,
        name_regex: Optional[re.Pattern[str]],
        content_regex: Optional[re.Pattern[str]] = None,
    ) -> bool:
        """
        :return: True iff a file's name and contents match the respective regular expressions; if no content regex is supplied only the filename is checked
        """
        if not name_regex or not re.search(name_regex, f.name):
            return False
        if not content_regex:
            logger.info(
                f"Filename matches {name_regex} and there is no content_regex"
            )
            return True
        # errors="replace" keeps the scan from crashing on binary or
        # oddly-encoded files whose names happen to match name_regex
        with f.open(mode="r", errors="replace") as lines:
            for line in lines:
                if not re.search(content_regex, line):
                    continue
                logger.info(
                    f"Filename matches {name_regex} and <{line}> matches {content_regex}"
                )
                return True
        return False

    @property
    def weight(self) -> float:
        """
        Not all tools are equally good. Here we supply a rather arbitrary weight
        to influence how much effect the presence of the tool has on the final
        score. The higher the weight, the more I like the tool.

        :return: The weight
        """
        return SastToolKind.weight(self.kind)
class SastUsageBasic(CheckInterface):
    """
    Check that detects which SAST tools a project uses per programming
    language and derives a score from the detections.
    """

    # Paths matching this pattern are excluded from the file scan.
    exclude: re.Pattern[str] = re.compile("(^.git$|test)")

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.tool_schema: dict[str, Any] = self.__load_tool_schema()
        self.__generate_tools()
        self.tools: list[SastTool] = self.__load_tools()
        self.lang_tools: dict[str, list[SastTool]] = self.__build_lang_tools()
        if not self.proj.languages():
            # fixed typo in the error message ("conatins")
            raise CheckConstructionError("Project contains no languages?!")

    def __load_tool_schema(self) -> dict[str, Any]:
        """
        Loads the JSON schema of the tool definitions from permanent storage.

        config: tool_schema
        :return: JSON schema of a single tool
        """
        schema_path: Path = context.settings[f"{self.name()}_tool_schema"]
        assert schema_path.is_file()
        logger.info(
            f"Loading tool definition schema for {self.name()} from {schema_path}"
        )
        with schema_path.open(mode="r") as f:
            return json.load(f)

    def __generate_tools(self) -> None:
        """
        Generates the individual JSON tool definitions from a CSV file that
        describes all of them.

        config: tools_csv
        effect: populates the directory config::tools_dir
        note: no-op if directory is not empty
        """
        tools_dir: Path = context.settings[f"{self.name()}_tools_dir"]
        # NOTE(review): the early-return guard promised by the docstring
        # ("no-op if directory is not empty") is disabled for testing
        # ("grepme") — re-enable it or update the docstring before release.
        # if len(list(tools_dir.iterdir())) != 0:
        #     return
        tool_defs: Path = Path(context.settings[f"{self.name()}_tools_csv"])
        with tool_defs.open(mode="r", encoding="UTF-8") as f:
            reader = csv.reader(f, delimiter="\t")
            # first row of the TSV names the tool-definition fields
            header: list[str] = next(reader)
            for row in reader:
                # pad short rows with "" so every header key is present
                tool: dict[str, Any] = dict(
                    zip_longest(header, row, fillvalue="")
                )
                # fixed: csv yields strings, so the original `== 0` int
                # comparison never matched and no tool was ever skipped
                if str(tool["applicable"]) == "0":
                    continue
                tool["languages"] = str(tool["languages"]).split(" ")
                jsonschema.validate(tool, self.tool_schema)
                out_file: Path = tools_dir / f"{tool['name']}.json"
                logger.info(f"Generated tool: {out_file.as_posix()}")
                out_file.write_text(json.dumps(tool))

    def __load_tools(self) -> list[SastTool]:
        """
        Loads the JSON tool definitions from permanent storage.

        config: tools_dir
        :return: list of tools
        """
        tools_dir: Path = context.settings[f"{self.name()}_tools_dir"]
        assert tools_dir.is_dir()
        tools: list[SastTool] = [
            SastTool.from_file_validate(self.tool_schema, file)
            for file in tools_dir.iterdir()
        ]
        logger.info(f"Loaded tools {[t['name'] for t in tools]}")
        return tools

    def __build_lang_tools(self) -> dict[str, list[SastTool]]:
        """
        Constructs a mapping from programming languages to tools

        :return: The mapping
        """
        mapping: dict[str, list[SastTool]] = defaultdict(list)
        for tool in self.tools:
            for language in tool["languages"]:
                mapping[language].append(tool)
        # fixed: use the module logger instead of the root logger
        logger.info(
            f"Built mapping: { {lng: [t['name'] for t in tools] for lng, tools in mapping.items()} }"
        )
        return mapping

    def _detect_sast_tools(
        self,
    ) -> dict[str, list[SastTool]]:
        """
        Performs the actual "analysis". Builds map that takes programming
        languages to the set of SAST tools that the project uses for this
        language.

        :return: The mapping
        """
        detected_tools: dict[str, list[SastTool]] = defaultdict(list)
        # hoisted out of the loop: the project's languages are invariant
        proj_langs: set[str] = {
            lng.lower()
            for lng in cast(dict[str, Any], self.proj.languages())
        }
        for lang, tools in self.lang_tools.items():
            if lang not in proj_langs:
                continue
            for f in self._gen_file_list():
                if not tools:
                    # every tool for this language has been found already
                    break
                # must create a copy for iteration since we remove items
                for tool in list(tools):
                    if tool.check_file(f):
                        # fixed typo in the log message ("lanugage")
                        logger.info(
                            f"Removing tool {tool.name} for language {lang}"
                        )
                        detected_tools[lang].append(tool)
                        tools.remove(tool)
        logger.info(
            "Detected SAST tools "
            f"{[(lang, [tool.name for tool in tools]) for lang, tools in detected_tools.items()]}"
            f" for project {self.proj.id}."
        )
        return detected_tools

    def _calc_score(self, detected_tools: dict[str, list[SastTool]]) -> float:
        """
        Consumes the result of `_detect_sast_tools` and calculates the final
        score out of it.

        :return: score
        """
        score: float = 0.0
        for lang, lweight in cast(
            dict[str, Any], self.proj.languages()
        ).items():
            assert isinstance(lang, str)
            assert isinstance(lweight, float)
            # language weights arrive as percentages; normalize to [0, 1]
            lweight_normed: float = float(lweight) / 100
            logger.info(f"Language {lang} has weight {lweight_normed}")
            # only the single best detected tool per language counts
            tweight: float = 0.0
            for tool in detected_tools[lang.lower()]:
                tweight = max(tweight, tool.weight)
            logger.info(f"Maximum tool weight is {tweight}")
            score += lweight_normed * tweight
        return score

    def run(self, args_dict: Optional[dict[str, Any]] = None) -> dict[str, Any]:
        """
        Runs the check: detects SAST tools per language, validates the
        result structure and merges score and results into the base output.

        :return: The check result map containing "score" and "results"
        """
        ret: dict[str, Any] = super().run(args_dict)
        detected_tools: dict[str, list[SastTool]] = self._detect_sast_tools()
        results: dict[str, Any] = {
            "lang_tools": [
                [lang, [tool.name for tool in tools]]
                for lang, tools in detected_tools.items()
            ],
        }
        assert self.results_valid(results)
        return {
            "score": self._calc_score(detected_tools),
            "results": results,
        } | ret