idank · tobiashochguertel · May 16, 2019 · Dec 31, 2020 · Jan 20, 2022 · Sep 14, 2024
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,15 @@
+.gitignore
+.git/
+.github/
+misc/
+tests/
+tools
+venv/
+dump/
+.mypy_cache/
+*.pyc
+*.log
+README.md
+docker-compose.yml
+Dockerfile
+Makefile
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,7 @@
 *.swp
 .coverage
 .vagrant
-application.log
+*.log
+venv/
+__pycache__
+.mypy_cache/
diff --git a/Dockerfile b/Dockerfile
@@ -1,18 +1,15 @@
-FROM python:2.7
+FROM python:3.12
 
-RUN apt-get update \
-  && apt-get install man-db -y \
-  && apt-get clean
+RUN apt-get update \
+  && apt-get install man-db -y \
+  && apt-get clean
 
-ADD ./requirements.txt /tmp/requirements.txt
+WORKDIR /opt/webapp
+COPY . .
 
-RUN pip install --upgrade pip \
-  && python --version \
-  && pip install -r /tmp/requirements.txt \
-  && rm -rf ~/.cache/pip/*
+RUN pip3 install --no-cache-dir --no-warn-script-location --upgrade pip setuptools wheel virtualenv \
+  && pip3 install --no-cache-dir --no-warn-script-location -r requirements.txt
 
-ADD ./ /opt/webapp/
-WORKDIR /opt/webapp
 EXPOSE 5000
 
-CMD ["make", "serve"]
+CMD ["python3", "runserver.py"]
diff --git a/Makefile b/Makefile
@@ -1,7 +1,7 @@
 tests:
-	nosetests --exe --with-doctest tests/ explainshell/
+	pytest --doctest-modules tests/ explainshell/
 
 serve:
-	python runserver.py
+	docker-compose up --build
 
 .PHONY: tests
diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@ When querying explainshell, it:
 > 
 > If you're relying on manpages, be aware that they may not reflect the latest behavior. Contributions in this area are welcome but would require rethinking the documentation pipeline.
 
-Right now explainshell.com contains the entire [archive of Ubuntu](http://manpages.ubuntu.com/). It's not
+Right now explainshell.com contains the entire [archive of Ubuntu](https://manpages.ubuntu.com/). It's not
 possible to directly add a missing man page to the live site (it might be in the future).
 
 ## Running explainshell locally

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,10 +1,10 @@
-version: '2'
 services:
   db:
     image: mongo
   web:
-    build: .
-    command: make serve
+    build:
+      context: .
+      dockerfile: Dockerfile
     environment:
       - MONGO_URI=mongodb://db
       - HOST_IP=0.0.0.0

diff --git a/explainshell/algo/classifier.py b/explainshell/algo/classifier.py
@@ -1,4 +1,6 @@
-import itertools, collections, logging
+import itertools
+import collections
+import logging
 
 import nltk
 import nltk.metrics
@@ -9,26 +11,32 @@
 
 logger = logging.getLogger(__name__)
 
+
 def get_features(paragraph):
     features = {}
-    ptext = paragraph.cleantext()
-    assert ptext
-
-    features['starts_with_hyphen'] = algo.features.starts_with_hyphen(ptext)
-    features['is_indented'] = algo.features.is_indented(ptext)
-    features['par_length'] = algo.features.par_length(ptext)
-    for w in ('=', '--', '[', '|', ','):
-        features['first_line_contains_%s' % w] = algo.features.first_line_contains(ptext, w)
-    features['first_line_length'] = algo.features.first_line_length(ptext)
-    features['first_line_word_count'] = algo.features.first_line_word_count(ptext)
-    features['is_good_section'] = algo.features.is_good_section(paragraph)
-    features['word_count'] = algo.features.word_count(ptext)
+    p_text = paragraph.clean_text()
+    logger.debug(f"length of p_text: {len(p_text)}")
+    assert p_text
+
+    features["starts_with_hyphen"] = algo.features.starts_with_hyphen(p_text)
+    features["is_indented"] = algo.features.is_indented(p_text)
+    features["par_length"] = algo.features.par_length(p_text)
+    for w in ("=", "--", "[", "|", ","):
+        features[f"first_line_contains_{w}"] = algo.features.first_line_contains(
+            p_text, w
+        )
+    features["first_line_length"] = algo.features.first_line_length(p_text)
+    features["first_line_word_count"] = algo.features.first_line_word_count(p_text)
+    features["is_good_section"] = algo.features.is_good_section(paragraph)
+    features["word_count"] = algo.features.word_count(p_text)
 
     return features
 
-class classifier(object):
-    '''classify the paragraphs of a man page as having command line options
-    or not'''
+
+class Classifier:
+    """classify the paragraphs of a man page as having command line options
+    or not"""
+
     def __init__(self, store, algo, **classifier_args):
         self.store = store
         self.algo = algo
@@ -39,59 +47,60 @@ def train(self):
         if self.classifier:
             return
 
-        manpages = self.store.trainingset()
+        man_pages = self.store.training_set()
 
         # flatten the manpages so we get a list of (manpage-name, paragraph)
         def flatten_manpages(manpage):
-            l = []
+            p_list = []
             for para in manpage.paragraphs:
-                l.append(para)
-            return l
-        paragraphs = itertools.chain(*[flatten_manpages(m) for m in manpages])
+                p_list.append(para)
+            return p_list
+
+        paragraphs = itertools.chain(*[flatten_manpages(m) for m in man_pages])
         training = list(paragraphs)
 
-        negids = [p for p in training if not p.is_option]
-        posids = [p for p in training if p.is_option]
+        neg_ids = [p for p in training if not p.is_option]
+        pos_ids = [p for p in training if p.is_option]
 
-        negfeats = [(get_features(p), False) for p in negids]
-        posfeats = [(get_features(p), True) for p in posids]
+        neg_feats = [(get_features(p), False) for p in neg_ids]
+        pos_feats = [(get_features(p), True) for p in pos_ids]
 
-        negcutoff = len(negfeats)*3/4
-        poscutoff = len(posfeats)*3/4
+        neg_cutoff = int(len(neg_feats) * 3 / 4)
+        pos_cutoff = int(len(pos_feats) * 3 / 4)
 
-        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
-        self.testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
+        train_feats = neg_feats[:neg_cutoff] + pos_feats[:pos_cutoff]
+        self.test_feats = neg_feats[neg_cutoff:] + pos_feats[pos_cutoff:]
 
-        logger.info('train on %d instances', len(trainfeats))
+        logger.info("train on %d instances", len(train_feats))
 
-        if self.algo == 'maxent':
+        if self.algo == "maxent":
             c = nltk.classify.maxent.MaxentClassifier
-        elif self.algo == 'bayes':
+        elif self.algo == "bayes":
             c = nltk.classify.NaiveBayesClassifier
         else:
-            raise ValueError('unknown classifier')
+            raise ValueError("unknown classifier")
 
-        self.classifier = c.train(trainfeats, **self.classifier_args)
+        self.classifier = c.train(train_feats, **self.classifier_args)
 
     def evaluate(self):
         self.train()
-        refsets = collections.defaultdict(set)
-        testsets = collections.defaultdict(set)
+        ref_sets = collections.defaultdict(set)
+        test_sets = collections.defaultdict(set)
 
-        for i, (feats, label) in enumerate(self.testfeats):
-            refsets[label].add(i)
+        for i, (feats, label) in enumerate(self.test_feats):
+            ref_sets[label].add(i)
             guess = self.classifier.prob_classify(feats)
             observed = guess.max()
-            testsets[observed].add(i)
-            #if label != observed:
-            #    print 'label:', label, 'observed:', observed, feats
+            test_sets[observed].add(i)
+            # if label != observed:
+            #    print('label:', label, 'observed:', observed, feats)
 
-        print 'pos precision:', nltk.metrics.precision(refsets[True], testsets[True])
-        print 'pos recall:', nltk.metrics.recall(refsets[True], testsets[True])
-        print 'neg precision:', nltk.metrics.precision(refsets[False], testsets[False])
-        print 'neg recall:', nltk.metrics.recall(refsets[False], testsets[False])
+        print("pos precision:", nltk.metrics.precision(ref_sets[True], test_sets[True]))
+        print("pos recall:", nltk.metrics.recall(ref_sets[True], test_sets[True]))
+        print("neg precision:", nltk.metrics.precision(ref_sets[False], test_sets[False]))
+        print("neg recall:", nltk.metrics.recall(ref_sets[False], test_sets[False]))
 
-        print self.classifier.show_most_informative_features(10)
+        print(self.classifier.show_most_informative_features(10))
 
     def classify(self, manpage):
         self.train()
@@ -102,10 +111,9 @@ def classify(self, manpage):
             option = guess.max()
             certainty = guess.prob(option)
 
-            if option:
-                if certainty < config.CLASSIFIER_CUTOFF:
-                    pass
-                else:
-                    logger.info('classified %s (%f) as an option paragraph', item, certainty)
-                    item.is_option = True
-                    yield certainty, item
+            if option and certainty >= config.CLASSIFIER_CUTOFF:
+                logger.info(
+                    "classified %s (%f) as an option paragraph", item, certainty
+                )
+                item.is_option = True
+                yield certainty, item
diff --git a/explainshell/algo/features.py b/explainshell/algo/features.py
@@ -1,7 +1,8 @@
 import re
 
+
 def extract_first_line(paragraph):
-    '''
+    """
     >>> extract_first_line('a b  cd')
     'a b'
     >>> extract_first_line('a b cd')
@@ -10,54 +11,63 @@ def extract_first_line(paragraph):
     'a b cd'
     >>> extract_first_line('  a b   cd')
     'a b'
-    '''
+    """
     lines = paragraph.splitlines()
     first = lines[0].strip()
-    spaces = list(re.finditer(r'(\s+)', first))
+    spaces = list(re.finditer(r"(\s+)", first))
     # handle options that have their description in the first line by trying
     # to treat it as two lines (looking at spaces between option and the rest
     # of the text)
     if spaces:
         longest = max(spaces, key=lambda m: m.span()[1] - m.span()[0])
         if longest and longest.start() > 1 and longest.end() - longest.start() > 1:
-            first = first[:longest.start()]
+            first = first[: longest.start()]
     return first
 
+
 def starts_with_hyphen(paragraph):
-    return paragraph.lstrip()[0] == '-'
+    return paragraph.lstrip()[0] == "-"
+
 
 def is_indented(paragraph):
     return paragraph != paragraph.lstrip()
 
+
 def par_length(paragraph):
     return round(len(paragraph.strip()), -1) / 2
 
+
 def first_line_contains(paragraph, what):
-    l = paragraph.splitlines()[0]
-    return what in l
+    ln = paragraph.splitlines()[0]
+    return what in ln
+
 
 def first_line_length(paragraph):
     first = extract_first_line(paragraph)
     return round(len(first), -1) / 2
 
+
 def first_line_word_count(paragraph):
     first = extract_first_line(paragraph)
     splitted = [s for s in first.split() if len(s) > 1]
 
     return round(len(splitted), -1)
 
+
 def is_good_section(paragraph):
     if not paragraph.section:
         return False
     s = paragraph.section.lower()
-    if 'options' in s:
+    if "options" in s:
         return True
-    if s in ('description', 'function letters'):
+    if s in ("description", "function letters"):
         return True
     return False
 
+
 def word_count(text):
-    return round(len(re.findall(r'\w+', text)), -1)
+    return round(len(re.findall(r"\w+", text)), -1)
+
 
 def has_bold(html):
-    return '<b>' in html
+    return "<b>" in html
diff --git a/explainshell/config.py b/explainshell/config.py
@@ -1,45 +1,14 @@
 import os
 
-_currdir = os.path.dirname(os.path.dirname(__file__))
+_curr_dir = os.path.dirname(os.path.dirname(__file__))
 
-MANPAGEDIR = os.path.join(_currdir, 'manpages')
+MAN_PAGE_DIR = os.path.join(_curr_dir, "manpages")
 CLASSIFIER_CUTOFF = 0.7
-TOOLSDIR = os.path.join(_currdir, 'tools')
+TOOLS_DIR = os.path.join(_curr_dir, "tools")
 
-MAN2HTML = os.path.join(TOOLSDIR, 'w3mman2html.cgi')
+MAN2HTML = os.path.join(TOOLS_DIR, "w3mman2html.cgi")
 
 # host to pass into Flask's app.run.
-HOST_IP = os.getenv('HOST_IP', False)
-MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost')
+HOST_IP = os.getenv("HOST_IP", "")
+MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost")
 DEBUG = True
-
-LOGGING_DICT = {
-    'version': 1,
-    'disable_existing_loggers': False,
-    'formatters': {
-        'standard': {
-            'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
-        },
-    },
-    'handlers': {
-        'console': {
-            'level' : 'INFO',
-            'class' : 'logging.StreamHandler',
-            'formatter': 'standard',
-        },
-        'file': {
-            'class': 'logging.FileHandler',
-            'level': 'INFO',
-            'formatter': 'standard',
-            'filename': 'application.log',
-            'mode': 'a',
-        },
-    },
-    'loggers': {
-        'explainshell': {
-            'handlers': ['console'],
-            'level': 'INFO',
-            'propagate': False
-        }
-    }
-}
diff --git a/explainshell/errors.py b/explainshell/errors.py
@@ -1,5 +1,6 @@
 class ProgramDoesNotExist(Exception):
     pass
 
+
 class EmptyManpage(Exception):
     pass