"""
This module contains the implementation of the Existence of Documentation
Infrastructure check.
The check performs a bunch of heuristics to estimate the amount of documentation
that exists for the piece of software developed in a repository.
"""
import logging
import re
from collections.abc import Iterable
from math import log
from pathlib import Path
from typing import Any, ClassVar, Optional
from urllib.parse import unquote, urlparse

from gitlab.exceptions import GitlabListError

from src.checks.interfaces_existence_of_documentation_infrastructure import (
    DocumentationTypeInterface,
)
from src.interfaces import CheckInterface
from src.utils import dir_list, file_list
logger: logging.Logger = logging.getLogger(__name__)
"""
Def: "Plain in-tree documentation" is defined as software documentation that is
directly managed by the git vcs. In particular, no further steps are necessary
to obtain the final form of the documentation after a checkout of the repository
has been performed.
"""
class PlainInTreeFile(DocumentationTypeInterface):
    """
    Def: Plain in-tree documentation is said to be "file" if it is contained
    within plain text files that are placed in the same (sub)tree as
    non-documentation related files.

    In practice this check does two things:

    1. It has a simple whitelist of file names that are automatically
       considered to contain documentation when they are found in the
       repository.
    2. It searches text files in the repository for links that point to files
       in the repository itself. It then uses those links to find the files
       locally.

    Finally, the set of all these files (set is deduplicated) is used to compute
    the amount of documentation.
    """

    #: Generated on a local dump of OpenCoDE using the command below (with
    #: some manual curation):
    #:
    #: .. code-block:: bash
    #:
    #:     for f in $(fd -t f -i --regex '\.(md|txt|rst)$'); do; \
    #:         b=$(basename "$f" | \
    #:         rg -vi \
    #:         '(license|changelog|security|contrib|test|release|conduct)'); \
    #:         [ ! -z $b ] && echo $b; done | \
    #:         sort | \
    #:         uniq -c | \
    #:         sort -n | \
    #:         tail -n 100
    #:
    #: All entries are lower case because file names are lower-cased before
    #: the lookup (see ``_doc_file_filter``).
    DOC_FILE_NAME_WHITELIST: ClassVar[set[str]] = {
        "config.json.md",
        "configuration.md",
        "gitworkflow.md",
        "index.md",
        "install.md",
        "intro.md",
        "languages.md",
        "metadata.txt",
        "modules.md",
        "notes.txt",
        "proxyconf.json.md",
        "proxy.md",
        "readme.de.md",
        "readme.md",
        "readme.rst",
        "readme.txt",
        "remoteinterface.md",
        "rest-services.json.md",
        "sensorthings.md",
        "services.json.md",
        "setupdev.md",
        "setup.md",
        "style.json.md",
        "vuetutorial.md",
    }

    def _url_to_file(self, url: str) -> Optional[Path]:
        """
        Tries its very best to convert a url (probably a link to a file in the
        remote GitLab repository) to a local file in our checkout.

        It is kinda important to keep in mind that the input is entirely
        untrusted and path traversal issues must be avoided (even though we
        would only open the file and report its number of characters). For
        that reason only the last path segment of the url is used, and it is
        only ever compared for equality against the names of files found in
        the checkout; no path is built from it.

        The segment is percent-decoded before the comparison, so that a link
        such as ``.../My%20Guide.md`` maps to the local file ``My Guide.md``.

        :param url: url that points to some file in the remote repo
        :return: local copy of the file, or None if the url is malformed, has
            no file name, or does not map to exactly one local file
        """
        try:
            name: str = unquote(urlparse(url).path.split("/")[-1])
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. a broken IPv6
            # literal). The links are scraped from arbitrary text, so this
            # must not abort the whole check.
            name = ""

        files: list[Path] = []
        if name:
            # The filter returns True for files that are to be skipped, i.e.
            # only files whose name equals ``name`` are kept.
            files = list(
                file_list(self.repo, file_name_filter=lambda x: x != name)
            )
        logger.info(f"Mapped: {url} => {files}")
        # An ambiguous match (several files with that name) is as useless as
        # no match at all, since we can not tell which one was meant.
        return files[0] if len(files) == 1 else None

    def _find_doc_files_from_links(self) -> set[Path]:
        """
        Checks for links to documentation that point back to the repository
        itself, both in the publiccode.yml and in text files. Then it tries to
        find the respective files locally.

        :return: Set of local documentation files that were found in that way.
        """
        links = self._docs_in_publiccodeyml(
            only_internal=True
        ) + self._collect_doc_links(only_internal=True)
        urls: list[str] = [link.url for link in links]
        logger.info(f"Found doc urls: {urls}")
        # Every lookup walks the checkout, so resolve each distinct url only
        # once; dict.fromkeys keeps the original order of first occurrence.
        resolved = (self._url_to_file(url) for url in dict.fromkeys(urls))
        return {file for file in resolved if file is not None}

    @staticmethod
    def _doc_file_filter(file_name: str) -> bool:
        """
        Used to filter out all non-documentation files when iterating over a
        repository.

        :param file_name: name of the file to decide
        :return: True iff file should be skipped
        """
        return file_name.lower() not in PlainInTreeFile.DOC_FILE_NAME_WHITELIST

    def delta(self) -> tuple[float, int]:
        """
        :return: confidence and amount of documentation found in whitelisted
            files and in files that are linked from within the repository
        """
        linked: set[Path] = self._find_doc_files_from_links()
        whitelisted: set[Path] = set(
            file_list(self.repo, file_name_filter=self._doc_file_filter)
        )
        doc_files: set[Path] = linked | whitelisted
        logger.info(f"Found documentation files: {doc_files}")
        # I am pretty confident that there are few FPs here, but also that there
        # are plenty of FNs.
        return 1.0, self._amount(doc_files)
class PlainInTreeFolder(DocumentationTypeInterface):
    """
    Def: Plain in-tree documentation is said to be "folder" if there exists a
    subtree that is solely comprised of documentation.

    In practice, we simply look for folders that are named something like
    `*doc*`. We then recursively count characters in this subtree (text files
    only).
    """

    #: Matches directory names that hint at documentation: "doc" or "dok"
    #: followed by "s", by "umenta", or by the end of the name (case is
    #: ignored).
    DOC_FOLDER_RE = re.compile(r".*?do[ck](s|umenta|$).*?", re.IGNORECASE)

    @classmethod
    def _doc_dir_predicate(cls, dir_name: str) -> bool:
        """
        :param dir_name: name of the directory to decide
        :return: True iff the directory name indicates that the directory
            holds documentation.
        """
        return cls.DOC_FOLDER_RE.search(dir_name) is not None

    def _find_doc_dirs(self) -> Iterable[Path]:
        """Returns all subtrees that are likely to hold only documentation."""
        # Materialised so that the result can be logged and still be iterated
        # by the caller afterwards.
        doc_dirs = list(dir_list(self.repo, self._doc_dir_predicate))
        logger.info(f" Found documentation directories: {doc_dirs}")
        return doc_dirs

    def _count_docs(self) -> int:
        """
        :return: number of non-whitespace characters in some kinds of text files
            that live within directories that maybe contain documentation.
        """
        # NOTE(review): if dir_list also reports documentation directories
        # that are nested inside other documentation directories, the files
        # of the inner one are presumably counted once per enclosing match.
        # Confirm against dir_list/file_list.
        total = 0
        for doc_dir in self._find_doc_dirs():
            text_files = file_list(
                self.repo, self._text_file_filter, root=doc_dir
            )
            total += self._amount(text_files)
        return total

    def delta(self) -> tuple[float, int]:
        """
        :return: confidence and amount of documentation found in folders
            whose name looks like a documentation folder
        """
        # not really sure which confidence to assign here ... the heuristic
        # works kinda well, however, I don't really trust it so use something
        # "in between good and meh"
        confidence = 0.77
        return confidence, self._count_docs()
class OutOfTreeExternal(DocumentationTypeInterface):
    """
    Def: "External out-of-tree documentation" is defined as any software
    documentation that can not be generated from the contents of the source code
    repository of the software and is not integrated into a management software
    that is wrapping the git repository. For example, this includes manually
    curated documentation that is hosted on an external website.

    In practice, this check does two things:

    1. Regex: It searches links with something like "docs" in their preview
       text within some kinds of text files. It then takes all links that are
       not pointing at the repository itself and marks them as external
       documentation (with low confidence).
    2. It searches the 'publiccode.yml' and checks for keys that point at
       documentation. If they do not point at the project, it counts them as
       external documentation with high confidence.

    As we can not (and don't want to) scrape the referenced websites in some
    way, the "amount" of documentation behind an external link is just a
    hard-coded value.
    """

    #: If some docs are found, the `amount` returned by the `delta` method will
    #: always evaluate to this score.
    HARD_CODED_SCORE: float = 0.75

    def _get_amount(self) -> int:
        """
        Generates a value for the amount of documentation that was found.

        Since we do not want to scrape websites this is just some hard-coded
        value that leads to a score we like.

        :return: The amount that leads to the hard-coded score
        """
        amount: float = ExistenceOfDocumentationInfrastructure.sigma_inv(
            self.HARD_CODED_SCORE
        )
        # delta() reports amounts as integers, so the fraction is cut off.
        return int(amount)

    def delta(self) -> tuple[float, int]:
        """
        :return: confidence and (hard-coded) amount of external documentation
        """
        if self._docs_in_publiccodeyml(only_external=True):
            # documentation detection via the publiccode.yml should be fairly
            # reliable, which is why we can use a rather high confidence
            confidence = 1.0
        elif self._collect_doc_links(only_external=True):
            # scanning for links is pretty noisy, thus use a low confidence
            confidence = 0.1
        else:
            # this whole heuristic is pretty bad, thus if we do not find
            # anything it doesn't really mean anything definitive
            return 0.5, 0
        return confidence, self._get_amount()
class OutOfTreeWiki(DocumentationTypeInterface):
    """
    Def: "Wiki out-of-tree documentation" is defined as any software
    documentation that is integrated into the management software that wraps the
    software's git repository. In particular, this encompasses documentation
    that was generated from the source code repository and then made available
    via this method.

    In the case of OpenCoDE, we check the Wiki pages of a project.

    API: https://python-gitlab.readthedocs.io/en/stable/gl_objects/wikis.html
    """

    def _fetch_wiki_pages(self) -> dict[str, int]:
        """
        :return: mapping of wiki page names to number of non-whitespace
            characters on the page
        """
        sizes: dict[str, int] = {}
        try:
            pages = self.api.wikis.list()
            for page in pages:
                # Each page is fetched individually by its slug to get at the
                # content of the page.
                raw: str = str(self.api.wikis.get(page.slug).content)
                # Pages are keyed by title, so a later page with the same
                # title replaces the earlier entry.
                sizes[str(page.title)] = len(self._remove_whitespace(raw))
            logger.info(
                f"{self.api.name_with_namespace} has {len(pages)} wiki pages"
            )
        except GitlabListError as err:
            # NOTE(review): only GitlabListError is handled; an error raised
            # while fetching a single page would propagate to the caller.
            # Confirm that this is intended.
            logger.error(
                f"Failed to list wikis for {self.api.name_with_namespace}: {err}"
            )
        return sizes

    def delta(self) -> tuple[float, int]:
        """
        :return: confidence and total number of non-whitespace characters on
            all wiki pages that could be fetched
        """
        page_sizes = self._fetch_wiki_pages()
        return 1.0, sum(page_sizes.values())
# Reminder: Also check for GitLab Pages once we have privileged API access
class ExistenceOfDocumentationInfrastructure(CheckInterface):
    """
    Implementation of the Existence of Documentation Infrastructure check.

    The class only contains the high-level logic of this check. It computes the
    mapping `delta` with the help of the specialized documentation type classes
    and then performs the score calculation based on that.
    """

    #: exponent that sets the scale on which the sigma function rises from
    #: 0 to 1. The value below ensures that 1000 chars of docs lead to a score
    #: of 0.5.
    DEFAULT_RISE: float = 9.967226258835993

    def __init__(self, *args, **kwargs):
        """Forwards all arguments to the base class and registers doc types."""
        super().__init__(*args, **kwargs)
        #: Specialized classes for detecting and counting the different kinds of
        #: documentation that we defined.
        self.DOC_TYPES: list[type[DocumentationTypeInterface]] = [
            OutOfTreeWiki,
            OutOfTreeExternal,
            PlainInTreeFile,
            PlainInTreeFolder,
        ]

    @staticmethod
    def _sigma(x: int, a: float = DEFAULT_RISE) -> float:
        r"""
        Maps an amount of documentation to a score.

        :param x: amount [0,+\infty)
        :param a: exponent that controls how quickly the score rises
        :return: score in [0,1)
        """
        return 1 - (1 + x) ** (-1 / a)

    @staticmethod
    def sigma_inv(y: float, a: float = DEFAULT_RISE) -> float:
        r"""
        Inverse of the sigma function with respect to the amount.

        :param y: score in [0,1)
        :param a: exponent that controls how quickly the score rises
        :return: amount in [0, +\infty)
        """
        return (1 - y) ** (-a) - 1

    @staticmethod
    def _sigma_inv_2(x: int, y: float) -> float:
        r"""
        Inverse of the sigma function with respect to the exponent.

        :param x: amount in [0, +\infty)
        :param y: score in [0,1)
        :return: exponent such that y = \sigma(x)
        """
        log_of_amount = log(x + 1, 1 - y)
        return -log_of_amount

    def _compute_delta(self) -> list[tuple[str, float, int]]:
        """
        :return: Pre-computed mapping delta for the current repository, one
            (name, confidence, amount) entry per documentation type.
        """
        rows: list[tuple[str, float, int]] = []
        for doc_type in self.DOC_TYPES:
            type_name = doc_type.name()
            confidence, amount = doc_type(self.repo, self.proj).delta()
            rows.append((type_name, confidence, amount))
        return rows

    def _score(self, delta: list[tuple[str, float, int]]) -> float:
        """
        :param delta: (name, confidence, amount) entries to evaluate
        :return: The final score of the current repository, i.e. the best
            confidence-weighted score over all documentation types.
        """
        weighted = [
            confidence * self._sigma(amount) for _, confidence, amount in delta
        ]
        return max(weighted)

    def _detailed_results(
        self, delta: list[tuple[str, float, int]]
    ) -> dict[str, list[dict[str, Any]]]:
        """
        :param delta: (name, confidence, amount) entries to report
        :return: Check specific results with some details about the different
            kinds of documentation that were detected.
        """
        entries: list[dict[str, Any]] = []
        for type_name, confidence, amount in delta:
            entries.append(
                {"name": type_name, "confidence": confidence, "amount": amount}
            )
        return {"documentation_type_information": entries}

    def run(self, args_dict: Optional[dict[str, Any]] = None) -> dict[str, Any]:
        """
        Runs the check on the current repository.

        :param args_dict: optional arguments, passed on to the base class
        :return: result of the base class extended by the keys "score" and
            "results"; keys that the base class already set take precedence
        """
        base: dict[str, Any] = super().run(args_dict)
        delta: list[tuple[str, float, int]] = self._compute_delta()
        own: dict[str, Any] = {
            "score": self._score(delta),
            "results": self._detailed_results(delta),
        }
        return own | base