# Source code for src.checks.comments_in_code
"""Implementation of the "Comments in Code" check"""
import json
import logging
import shutil
import subprocess as sp
from collections import defaultdict
from pathlib import Path
from typing import Any, Optional, cast
from src.exceptions import CheckConstructionError
from src.interfaces import CheckInterface
logger: logging.Logger = logging.getLogger(__name__)
class CommentsInCode(CheckInterface):
    """
    Implementation of the "Comments in Code" check.

    This check essentially just runs `tokei` and divides comment lines by
    combined comment and code lines. There is some additional logic to map
    tokei's language names onto linguist's so that the two tools agree on
    the set of languages considered.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """
        Gather all per-language statistics needed by :meth:`run`.

        :raises CheckConstructionError: if tokei is not on PATH, or if the
            repository contains no language supported by this check
        """
        super().__init__(*args, **kwargs)
        # Fail fast: __run_tokei() below shells out to the tokei binary, so
        # verify the executable exists *before* any work is done. (Checking
        # afterwards would surface a raw FileNotFoundError from subprocess
        # instead of the intended CheckConstructionError.)
        if not self.__have_tokei():
            raise CheckConstructionError(
                "Can not find tokei executable on PATH"
            )
        # Map A: tokei language name (lowercased) -> linguist name or None.
        self.tokei_to_linguist: dict[
            str, Optional[str]
        ] = self.__load_tokei_to_linguist()
        self.l_check: set[str] = self.__compute_l_check(self.tokei_to_linguist)
        self.linguist: dict[str, float] = self.__fetch_linguist()
        self.tokei: dict[str, dict[str, int]] = self.__run_tokei()
        # I know this step is redundant but I do it anyways!
        self.l_repo: set[str] = self.__compute_l_repo(
            # for L(r) we only use what tokei and linguist detected
            set(self.tokei.keys()).intersection(set(self.linguist.keys())),
            self.l_check,
        )
        if not self.l_repo:
            raise CheckConstructionError(
                "Project contains no language supported by this check"
            )

    def __load_tokei_to_linguist(self) -> dict[str, Optional[str]]:
        """Load the tokei-name -> linguist-name mapping from resources."""
        # generate with
        # https://gitlab.opencode.de/OC000014832448/tokei_to_linguist
        with (self._get_resource_dir() / "tokei_to_linguist.json").open() as f:
            return json.load(f)

    def __compute_l_check(
        self, tokei_to_linguist: dict[str, Optional[str]]
    ) -> set[str]:
        """
        Compute L_check via relation L_check = Im(A) \\ {None}
        """
        # `if x` drops both None and empty-string images of the map A.
        return {str(x) for x in tokei_to_linguist.values() if x}

    def __have_tokei(self) -> bool:
        """Return True iff the tokei executable can be found on PATH."""
        return shutil.which("tokei") is not None

    def __fetch_linguist(self) -> dict[str, float]:
        """
        Fetch the linguist language shares from the project, with language
        names lowercased to match the tokei_to_linguist mapping.
        """
        return {
            str(k).lower(): float(v)
            for k, v in cast(dict[str, Any], self.proj.languages()).items()
        }

    def __compute_l_repo(self, l_of_r: set[str], l_check: set[str]) -> set[str]:
        """Compute L_repo = L(r) ∩ L_check."""
        return l_of_r.intersection(l_check)

    def __run_tokei(self) -> dict[str, dict[str, int]]:
        """
        Run tokei on the repository working tree and aggregate its per-language
        code/comment line counts under linguist language names.

        :raises subprocess.CalledProcessError: if tokei exits non-zero
        """
        raw: sp.CompletedProcess[bytes] = sp.run(
            ["tokei", "-o", "json"],
            capture_output=True,
            check=True,
            cwd=Path(str(self.repo.working_dir)),
        )
        tokei_results: dict[str, dict[str, int]] = defaultdict(
            lambda: {
                "code": 0,
                "comments": 0,
            }
        )
        for lang, stats in json.loads(raw.stdout).items():
            # restricts the keys of self.tokei to L_linguist, and thus also to
            # L_check (and since it comes from analyzing the repo, also L(r)),
            # i.e., we compute A(l) and skip None
            # grepme: change this to indexing once the bug in the script is
            # fixed
            lang_mapped: Optional[str] = self.tokei_to_linguist.get(
                str(lang).lower()
            )
            if not lang_mapped:
                continue
            # The map A is not injective, thus we might need to update an
            # existing entry, e.g., C headers and C source files both count
            # to C language
            tokei_results[lang_mapped]["code"] += int(stats["code"])
            tokei_results[lang_mapped]["comments"] += int(stats["comments"])
        return tokei_results

    def _tokei(self, lang: str) -> float:
        """
        Map that takes a language in L_repo to its comments to code ratio.

        :param lang: a linguist language name present in ``self.tokei``
        :return: ``comments / (code + comments)``, or 0.0 when the language
            has neither code nor comment lines (guards ZeroDivisionError,
            e.g. a file tokei counts as blanks only)
        """
        ncomments: int = self.tokei[lang]["comments"]
        ncode: int = self.tokei[lang]["code"]
        total: int = ncode + ncomments
        return ncomments / total if total else 0.0

    def _sigma(self, value: float) -> float:
        """
        Scoring function that receives the average comments to code ratio as an
        input and maps it to the final score.

        Needed since we cannot expect a project to be 100% comments to receive
        a perfect score: ratios of 1/slope (10%) or above already score 1.0,
        below that the score rises linearly.
        """
        slope: int = 10
        return value * slope if value < 1 / slope else 1.0

    def _compute_tokei(self) -> dict[str, float]:
        """
        :return: The computed `tokei` map (language -> comments to code ratio)
        """
        lang_ratios: dict[str, float] = {}
        for lang in self.l_repo:
            ratio: float = self._tokei(lang)
            # lazy %-formatting: only rendered if INFO logging is enabled
            logger.info("C2C: %s - %s", lang, ratio)
            lang_ratios |= {lang: ratio}
        return lang_ratios

    def _compute_score(self, lang_ratios: dict[str, float]) -> float:
        """
        Average the per-language ratios weighted by their linguist shares,
        then map the result through the scoring function sigma.
        """
        return self._sigma(
            sum(lang_ratios[lang] * self.linguist[lang] for lang in self.l_repo)
            / sum(self.linguist[lang] for lang in self.l_repo)
        )

    def run(self, args_dict: Optional[dict[str, Any]] = None) -> dict[str, Any]:
        """
        Execute the check.

        :param args_dict: optional arguments forwarded to the base class
        :return: dict with the final ``score`` and the per-language
            ``results`` ratios, merged with the base class's return value
            (base-class keys win on conflict via the ``|`` merge)
        """
        ret: dict[str, Any] = super().run(args_dict)
        lang_ratios: dict[str, float] = self._compute_tokei()
        return {
            "score": self._compute_score(lang_ratios),
            "results": lang_ratios,
        } | ret