Source code for src.checks.existence_of_documentation_infrastructure

"""
This module contains the implementation of the Existence of Documentation
Infrastructure check.

The check performs a bunch of heuristics to estimate the amount of documentation
that exists for the piece of software developed in a repository.
"""

import logging
import re
from collections.abc import Iterable
from math import log
from pathlib import Path
from typing import Any, ClassVar, Optional
from urllib.parse import urlparse

from gitlab.exceptions import GitlabListError

from src.checks.interfaces_existence_of_documentation_infrastructure import (
    DocumentationTypeInterface,
)
from src.interfaces import CheckInterface
from src.utils import dir_list, file_list

# Module-level logger, named after this module via ``__name__``; used by all
# documentation type classes below for informational and error messages.
logger: logging.Logger = logging.getLogger(__name__)

"""
Def: "Plain in-tree documentation" is defined as software documentation that is
directly managed by the git vcs. In particular, no further steps are necessary
to obtain the final form of the documentation after a checkout of the repository
has been performed.
"""


class PlainInTreeFile(DocumentationTypeInterface):
    """
    Def: Plain in-tree documentation is said to be "file" if it is contained
    within plain text files that are placed in the same (sub)tree as
    non-documentation related files.

    In practice this check combines two sources of documentation files:

    1. A simple whitelist of file names. Every file in the repository whose
       (lower-cased) name is on the whitelist is considered documentation.
    2. Links found in text files of the repository that point to files in the
       repository itself. Those links are resolved to local files.

    The deduplicated union of both sets is used to compute the amount of
    documentation.
    """

    #: Lower-case file names that are always treated as documentation.
    #: Generated on a local dump of OpenCoDE using (with some manual curation):
    #: .. code-block:: bash
    #:    for f in $(fd -t f -i --regex '\.(md|txt|rst)$'); do; \
    #:      b=$(basename "$f" | \
    #:        rg -vi \
    #:        '(license|changelog|security|contrib|test|release|conduct)'); \
    #:      [ ! -z $b ] && echo $b; done | \
    #:      sort | \
    #:      uniq -c | \
    #:      sort -n | \
    #:      tail -n 100
    #:
    DOC_FILE_NAME_WHITELIST: ClassVar = {
        "config.json.md",
        "configuration.md",
        "gitworkflow.md",
        "index.md",
        "install.md",
        "intro.md",
        "languages.md",
        "metadata.txt",
        "modules.md",
        "notes.txt",
        "proxyconf.json.md",
        "proxy.md",
        "readme.de.md",
        "readme.md",
        "readme.rst",
        "readme.txt",
        "remoteinterface.md",
        "rest-services.json.md",
        "sensorthings.md",
        "services.json.md",
        "setupdev.md",
        "setup.md",
        "style.json.md",
        "vuetutorial.md",
    }

    def _url_to_file(self, url: str) -> Optional[Path]:
        """
        Map a url (probably a link to a file in the remote GitLab repository)
        to a file in the local checkout.

        The input is entirely untrusted, so path traversal must be avoided.
        For that reason only the last path segment of the url is used, and it
        is only ever compared against the names of files that already exist
        in the repository.

        :param url: url that points to some file in the remote repo
        :return: the local file if exactly one file in the repository carries
            the name from the url, otherwise None
        """
        wanted: str = urlparse(url).path.rsplit("/", 1)[-1]

        def skip(candidate: str) -> bool:
            # The filter answers "should this file be skipped?", so everything
            # that does not carry the wanted name is dropped.
            return candidate != wanted

        matches: list[Path] = list(file_list(self.repo, file_name_filter=skip))
        logger.info(f"Mapped: {url} => {matches}")
        # Zero hits or an ambiguous name cannot be resolved to a single file.
        if len(matches) == 1:
            return matches[0]
        return None

    @staticmethod
    def _doc_file_filter(file_name: str) -> bool:
        """
        Filter used to drop all non-documentation files while iterating over
        a repository. The comparison is case-insensitive.

        :param file_name: name of the file to decide
        :return: True iff file should be skipped
        """
        lowered = file_name.lower()
        return not (lowered in PlainInTreeFile.DOC_FILE_NAME_WHITELIST)

    def delta(self) -> tuple[float, int]:
        """
        :return: confidence (always 1.0) and the amount of documentation found
            in files reached via links plus files with whitelisted names
        """
        doc_files: set[Path] = set()
        # Files referenced by links first, then files with whitelisted names;
        # the set takes care of duplicates between the two sources.
        doc_files.update(set(self._find_doc_files_from_links()))
        doc_files.update(
            set(file_list(self.repo, file_name_filter=self._doc_file_filter))
        )
        logger.info(f"Found documentation files: {doc_files}")
        # I am pretty confident that there are few FPs here, but also that
        # there are plenty of FNs.
        confidence = 1.0
        return confidence, self._amount(doc_files)
class PlainInTreeFolder(DocumentationTypeInterface):
    """
    Def: Plain in-tree documentation is said to be "folder" if there exists a
    subtree that is solely comprised of documentation.

    In practice, we simply look for folders that are named something like
    `*doc*`. We then recursively count characters in this subtree (text files
    only).
    """

    #: used to match directory names that contain documentation
    DOC_FOLDER_RE = re.compile(r".*?do[ck](s|umenta|$).*?", re.IGNORECASE)

    @classmethod
    def _doc_dir_predicate(cls, dir_name: str) -> bool:
        """
        :param dir_name: name of the directory to decide
        :return: True iff the directory name indicates that the directory
            holds documentation.
        """
        # A match object is always truthy, so "found a match" is the answer.
        return cls.DOC_FOLDER_RE.search(dir_name) is not None

    def _find_doc_dirs(self) -> Iterable[Path]:
        """Returns all subtrees that are likely to hold only documentation."""
        doc_dirs = list(dir_list(self.repo, self._doc_dir_predicate))
        logger.info(f" Found documentation directories: {doc_dirs}")
        return doc_dirs

    def _count_docs(self) -> int:
        """
        :return: number of non-whitespace characters in some kinds of text
            files that live within directories that maybe contain
            documentation.
        """
        # Collect the file listing of every documentation directory first ...
        listings = [
            file_list(self.repo, self._text_file_filter, root=doc_dir)
            for doc_dir in self._find_doc_dirs()
        ]
        # ... then add up the amount of documentation found in each of them.
        total = 0
        for files in listings:
            total += self._amount(files)
        return total

    def delta(self) -> tuple[float, int]:
        """
        :return: confidence (fixed at 0.77) and the amount of documentation
            found in documentation directories
        """
        # not really sure which confidence to assign here ... the heuristic
        # works kinda well, however, I don't really trust it so use something
        # "in between good and meh"
        confidence = 0.77
        return confidence, self._count_docs()
class OutOfTreeExternal(DocumentationTypeInterface):
    """
    Def: "External out-of-tree documentation" is defined as any software
    documentation that can not be generated from the contents of the source
    code repository of the software and is not integrated into a management
    software that is wrapping the git repository. For example, this includes
    manually curated documentation that is hosted on an external website.

    In practice, this check does two things:

    1. Regex: It searches links with something like "docs" in their preview
       text within some kinds of text files. It then takes all links that are
       not pointing at the repository itself and marks them as external
       documentation (with low confidence).
    2. It searches the 'publiccode.yml' and checks for keys that point at
       documentation. If they do not point at the project, it counts them as
       external documentation with high confidence.

    As we can not (and don't want to) scrape the referenced websites in some
    way, the "amount" of documentation behind an external link is just a
    hard-coded value.
    """

    #: If some docs are found, the `amount` returned by the `delta` method
    #: will always evaluate to this score.
    HARD_CODED_SCORE: float = 0.75

    def _get_amount(self) -> int:
        """
        Generates a value for the amount of documentation that was found.
        Since we do not want to scrape websites this is just some hard-coded
        value that leads to a score we like.

        :return: The amount that leads to the hard-coded score
        """
        # Invert the score function so that the amount maps back onto
        # HARD_CODED_SCORE (up to the truncation to an integer).
        amount = ExistenceOfDocumentationInfrastructure.sigma_inv(
            self.HARD_CODED_SCORE
        )
        return int(amount)

    def delta(self) -> tuple[float, int]:
        """
        :return: confidence and amount of external documentation. The
            confidence depends on which heuristic found something; the amount
            is either the hard-coded one or 0.
        """
        # documentation detection via the publiccode.yml should be fairly
        # reliable, which is why we can use a rather high confidence
        found_in_publiccode = self._docs_in_publiccodeyml(only_external=True)
        if found_in_publiccode:
            return 1.0, self._get_amount()

        # scanning for links is pretty noisy, thus use a low confidence;
        # this is only attempted when the publiccode.yml yielded nothing
        found_links = self._collect_doc_links(only_external=True)
        if found_links:
            return 0.1, self._get_amount()

        # this whole heuristic is pretty bad, thus if we do not find anything
        # it doesn't really mean anything definitive
        return 0.5, 0
class OutOfTreeWiki(DocumentationTypeInterface):
    """
    Def: "Wiki out-of-tree documentation" is defined as any software
    documentation that is integrated into the management software that wraps
    the software's git repository. In particular, this encompasses
    documentation that was generated from the source code repository and then
    made available via this method.

    In the case of OpenCoDE, we check the Wiki pages of a project.

    API: https://python-gitlab.readthedocs.io/en/stable/gl_objects/wikis.html
    """

    def _fetch_wiki_pages(self) -> dict[str, int]:
        """
        Fetch every wiki page of the project and measure its content.

        The result is keyed by the page slug rather than the page title:
        pages in different wiki directories may share a title, and keying by
        title would let them overwrite each other and undercount the
        documentation.

        A failure to list the wiki pages is logged and results in an empty
        mapping; it is not raised to the caller.

        :return: mapping of wiki page slugs to number of non-whitespace
            characters on the page
        """
        sizes: dict[str, int] = {}
        try:
            pages = self.api.wikis.list()
            for page in pages:
                # The listing is followed by one request per page to obtain
                # the page content.
                content: str = self._remove_whitespace(
                    str(self.api.wikis.get(page.slug).content)
                )
                sizes[str(page.slug)] = len(content)
            logger.info(
                f"{self.api.name_with_namespace} has {len(pages)} wiki pages"
            )
        except GitlabListError as E:
            logger.error(
                f"Failed to list wikis for {self.api.name_with_namespace}: {E}"
            )
        return sizes

    def delta(self) -> tuple[float, int]:
        """
        :return: confidence (always 1.0) and the total number of
            non-whitespace characters over all wiki pages
        """
        return 1.0, sum(self._fetch_wiki_pages().values())
# Reminder: Also check for GitLab Pages once we have privileged API access
class ExistenceOfDocumentationInfrastructure(CheckInterface):
    """
    Implementation of the Existence of Documentation Infrastructure check.

    The class only contains the high-level logic of this check. It computes
    the mapping `delta` with the help of the specialized documentation type
    classes and then performs the score calculation based on that.
    """

    #: exponent that sets the scale on which the sigma function rises from
    #: 0 to 1. The value below ensures that 1000 chars of docs lead to a
    #: score of 0.5.
    DEFAULT_RISE: float = 9.967226258835993

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class and register the
        documentation type classes used by this check."""
        super().__init__(*args, **kwargs)
        #: Specialized classes for detecting and counting the different kinds
        #: of documentation that we defined.
        self.DOC_TYPES: list[type[DocumentationTypeInterface]] = [
            OutOfTreeWiki,
            OutOfTreeExternal,
            PlainInTreeFile,
            PlainInTreeFolder,
        ]

    @staticmethod
    def _sigma(x: int, a: float = DEFAULT_RISE) -> float:
        r"""
        Map an amount of documentation onto a score.

        :param x: amount [0,+\infty)
        :param a: exponent that controls how quickly the score rises
        :return: score in [0,1)
        """
        return 1 - pow(1 + x, -1 / a)

    @staticmethod
    def sigma_inv(y: float, a: float = DEFAULT_RISE) -> float:
        r"""
        Inverse of the score function for a fixed exponent.

        :param y: score in [0,1)
        :param a: exponent that was used to compute the score
        :return: amount in [0, +\infty)
        """
        return pow(1 - y, -a) - 1

    @staticmethod
    def _sigma_inv_2(x: int, y: float) -> float:
        r"""
        Solve the score function for its exponent.

        :param x: amount in [0, +\infty)
        :param y: score in [0,1)
        :return: exponent such that y = \sigma(x)
        """
        return -log(x + 1, 1 - y)

    def _compute_delta(self) -> list[tuple[str, float, int]]:
        """
        :return: Pre-computed mapping delta for the current repository: one
            (name, confidence, amount) tuple per documentation type.
        """
        return [
            (t.name(), *t(self.repo, self.proj).delta())
            for t in self.DOC_TYPES
        ]

    def _score(self, delta: list[tuple[str, float, int]]) -> float:
        """
        :param delta: (name, confidence, amount) tuples as returned by
            `_compute_delta`
        :return: The final score of the current repository, i.e. the best
            confidence-weighted score over all documentation types. An empty
            `delta` yields 0.0 instead of raising.
        """
        # ``default`` keeps max() from raising ValueError when no
        # documentation type is registered.
        return max((c * self._sigma(a) for _, c, a in delta), default=0.0)

    def _detailed_results(
        self, delta: list[tuple[str, float, int]]
    ) -> dict[str, list[dict[str, Any]]]:
        """
        :param delta: (name, confidence, amount) tuples as returned by
            `_compute_delta`
        :return: Check specific results with some details about the different
            kinds of documentation that were detected.
        """
        return {
            "documentation_type_information": [
                {"name": n, "confidence": c, "amount": a} for n, c, a in delta
            ]
        }

    def run(self, args_dict: Optional[dict[str, Any]] = None) -> dict[str, Any]:
        """
        Run the check on the current repository.

        :param args_dict: optional arguments, passed on to the base class
        :return: the score and detailed results of this check merged with the
            result of the base class; keys from the base class win on
            conflict
        """
        ret: dict[str, Any] = super().run(args_dict)
        delta: list[tuple[str, float, int]] = self._compute_delta()
        return {
            "score": self._score(delta),
            "results": self._detailed_results(delta),
        } | ret