# Source code for src.checks.comments_in_code
"""Implementation of the "Comments in Code" check"""
import json
import logging
import shutil
import subprocess as sp
from collections import defaultdict
from pathlib import Path
from typing import Any, Optional, cast
from src.exceptions import CheckConstructionError
from src.interfaces import CheckInterface
logger: logging.Logger = logging.getLogger(__name__)
class CommentsInCode(CheckInterface):
    """
    Implementation of the "Comments in Code" check.

    This check essentially just runs `tokei` and divides comment lines by
    combined comment and code lines. There is some additional logic to map
    tokei's language names onto linguist's so that the two tools agree on
    the set of languages considered.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """
        Gather all per-language statistics needed by :meth:`run`.

        :raises CheckConstructionError: if tokei is not on PATH, or if the
            repository contains no language supported by this check
        """
        super().__init__(*args, **kwargs)
        # Fail fast: __run_tokei() below shells out to the tokei binary, so
        # verify the executable exists *before* any work is done. (Checking
        # afterwards would surface a raw FileNotFoundError from subprocess
        # instead of the intended CheckConstructionError.)
        if not self.__have_tokei():
            raise CheckConstructionError(
                "Can not find tokei executable on PATH"
            )
        # Map A: tokei language name (lowercased) -> linguist name or None.
        self.tokei_to_linguist: dict[
            str, Optional[str]
        ] = self.__load_tokei_to_linguist()
        self.l_check: set[str] = self.__compute_l_check(self.tokei_to_linguist)
        self.linguist: dict[str, float] = self.__fetch_linguist()
        self.tokei: dict[str, dict[str, int]] = self.__run_tokei()
        # I know this step is redundant but I do it anyways!
        self.l_repo: set[str] = self.__compute_l_repo(
            # for L(r) we only use what tokei and linguist detected
            set(self.tokei.keys()).intersection(set(self.linguist.keys())),
            self.l_check,
        )
        if not self.l_repo:
            raise CheckConstructionError(
                "Project contains no language supported by this check"
            )

    def __load_tokei_to_linguist(self) -> dict[str, Optional[str]]:
        """Load the tokei-name -> linguist-name mapping from resources."""
        # generate with
        # https://gitlab.opencode.de/OC000014832448/tokei_to_linguist
        with (self._get_resource_dir() / "tokei_to_linguist.json").open() as f:
            return json.load(f)

    def __compute_l_check(
        self, tokei_to_linguist: dict[str, Optional[str]]
    ) -> set[str]:
        """
        Compute L_check via relation L_check = Im(A) \\ {None}
        """
        # `if x` drops both None and empty-string images of the map A.
        return {str(x) for x in tokei_to_linguist.values() if x}

    def __have_tokei(self) -> bool:
        """Return True iff the tokei executable can be found on PATH."""
        return shutil.which("tokei") is not None

    def __fetch_linguist(self) -> dict[str, float]:
        """
        Fetch the linguist language shares from the project, with language
        names lowercased to match the tokei_to_linguist mapping.
        """
        return {
            str(k).lower(): float(v)
            for k, v in cast(dict[str, Any], self.proj.languages()).items()
        }

    def __compute_l_repo(self, l_of_r: set[str], l_check: set[str]) -> set[str]:
        """Compute L_repo = L(r) ∩ L_check."""
        return l_of_r.intersection(l_check)

    def __run_tokei(self) -> dict[str, dict[str, int]]:
        """
        Run tokei on the repository working tree and aggregate its per-language
        code/comment line counts under linguist language names.

        :raises subprocess.CalledProcessError: if tokei exits non-zero
        """
        raw: sp.CompletedProcess[bytes] = sp.run(
            ["tokei", "-o", "json"],
            capture_output=True,
            check=True,
            cwd=Path(str(self.repo.working_dir)),
        )
        tokei_results: dict[str, dict[str, int]] = defaultdict(
            lambda: {
                "code": 0,
                "comments": 0,
            }
        )
        for lang, stats in json.loads(raw.stdout).items():
            # restricts the keys of self.tokei to L_linguist, and thus also to
            # L_check (and since it comes from analyzing the repo, also L(r)),
            # i.e., we compute A(l) and skip None
            # grepme: change this to indexing once the bug in the script is
            # fixed
            lang_mapped: Optional[str] = self.tokei_to_linguist.get(
                str(lang).lower()
            )
            if not lang_mapped:
                continue
            # The map A is not injective, thus we might need to update an
            # existing entry, e.g., C headers and C source files both count
            # to C language
            tokei_results[lang_mapped]["code"] += int(stats["code"])
            tokei_results[lang_mapped]["comments"] += int(stats["comments"])
        return tokei_results

    def _tokei(self, lang: str) -> float:
        """
        Map that takes a language in L_repo to its comments to code ratio.

        :param lang: a linguist language name present in ``self.tokei``
        :return: ``comments / (code + comments)``, or 0.0 when the language
            has neither code nor comment lines (guards ZeroDivisionError,
            e.g. a file tokei counts as blanks only)
        """
        ncomments: int = self.tokei[lang]["comments"]
        ncode: int = self.tokei[lang]["code"]
        total: int = ncode + ncomments
        return ncomments / total if total else 0.0

    def _sigma(self, value: float) -> float:
        """
        Scoring function that receives the average comments to code ratio as an
        input and maps it to the final score.

        Needed since we cannot expect a project to be 100% comments to receive
        a perfect score: ratios of 1/slope (10%) or above already score 1.0,
        below that the score rises linearly.
        """
        slope: int = 10
        return value * slope if value < 1 / slope else 1.0

    def _compute_tokei(self) -> dict[str, float]:
        """
        :return: The computed `tokei` map (language -> comments to code ratio)
        """
        lang_ratios: dict[str, float] = {}
        for lang in self.l_repo:
            ratio: float = self._tokei(lang)
            # lazy %-formatting: only rendered if INFO logging is enabled
            logger.info("C2C: %s - %s", lang, ratio)
            lang_ratios |= {lang: ratio}
        return lang_ratios

    def _compute_score(self, lang_ratios: dict[str, float]) -> float:
        """
        Average the per-language ratios weighted by their linguist shares,
        then map the result through the scoring function sigma.
        """
        return self._sigma(
            sum(lang_ratios[lang] * self.linguist[lang] for lang in self.l_repo)
            / sum(self.linguist[lang] for lang in self.l_repo)
        )

    def run(self, args_dict: Optional[dict[str, Any]] = None) -> dict[str, Any]:
        """
        Execute the check.

        :param args_dict: optional arguments forwarded to the base class
        :return: dict with the final ``score`` and the per-language
            ``results`` ratios, merged with the base class's return value
            (base-class keys win on conflict via the ``|`` merge)
        """
        ret: dict[str, Any] = super().run(args_dict)
        lang_ratios: dict[str, float] = self._compute_tokei()
        return {
            "score": self._compute_score(lang_ratios),
            "results": lang_ratios,
        } | ret