From 376c7d34e2c5d3737df37a0949964c046d554f4c Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 20 Sep 2023 14:46:36 +0900
Subject: [PATCH 01/15] suggest stackoverflow when the source is not found

---
 main.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/main.py b/main.py
index 50a727a..2007ef6 100644
--- a/main.py
+++ b/main.py
@@ -1,12 +1,17 @@
-import argparse, traceback
-from multiprocessing import Pool, cpu_count
-from utils import *
-from downloader import Stack_Exchange_Downloader
-from pairer import QA_Pairer
+import argparse
 import os
+import traceback
+import zipfile
 from itertools import repeat
+from multiprocessing import Pool, cpu_count
+
+import dotenv
 from lm_dataformat import Archive
-import zipfile
+
+from downloader import Stack_Exchange_Downloader
+from pairer import QA_Pairer
+
+dotenv.load_dotenv(override=True)
 
 
 def download_and_process_single(name, out_format, min_score, max_responses):
@@ -14,6 +19,10 @@ def download_and_process_single(name, out_format, min_score, max_responses):
     name = name.strip().lower()
     os.makedirs("dumps", exist_ok=True)
     s = Stack_Exchange_Downloader(name)
+    if name not in s.sites:
+        similar_entries = list(filter(lambda key: key.startswith(name) or key.endswith(name), s.sites.keys()))
+        print("StackExchange source not found. Perhaps you meant", similar_entries)
+        return
     path_to_xml = "dumps/{}/Posts.xml".format(name)
     if name != "stackoverflow":
         path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
     else:
         path_to_7z = "dumps/stackoverflow.com-Posts.7z"
@@ -85,6 +94,7 @@ def main(args):
     parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. '
                                                 'Default 3.', type=int, default=3)
     args = parser.parse_args()
+
     main(args)

From bd442f6893ae254cf4062991fd3d006b75c330bb Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 20 Sep 2023 15:18:22 +0900
Subject: [PATCH 02/15] add command to print the list of sources

---
 README.md | 17 ++++++++++++++++-
 main.py   | 10 ++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2f9cb51..5df2be0 100644
--- a/README.md
+++ b/README.md
@@ -11,18 +11,33 @@ pip install -r requirements.txt
 ```
 
 # Usage
-To download *every* stackexchange dump & parse to text, simply run
+
+## List all available StackExchange dumps
+
+```
+python3 main.py --list
+```
+
+
+
+## Download every StackExchange dump
+
+To download *every* stackexchange dump & parse to text, simply run
 
 ```
 python3 main.py --names all
 ```
 
+## Download a single StackExchange dump
+
 To download only a single stackexchange, you can add the name as an optional argument. E.G:
 
 ```
 python3 main.py --names security.stackexchange
 ```
 
+## Download a list of StackExchange dumps
+
 To download a list of multiple stackexchanges, you can add the names separated by commas. E.G:
 ```
diff --git a/main.py b/main.py
index 2007ef6..131653d 100644
--- a/main.py
+++ b/main.py
@@ -60,6 +60,12 @@ def download_and_process_single(name, out_format, min_score, max_responses):
 
 
 def main(args):
+    if args.list:
+        s = Stack_Exchange_Downloader("all")
+        print("List of all the sources of StackExchange: ")
+        print("- "+"\n- ".join(sorted(s.sites.keys())))
+        return
+
     names = args.names.split(',')
     if names[0].strip().lower() == "all":
         s = Stack_Exchange_Downloader("all")
@@ -81,6 +87,10 @@ def main(args):
     parser = argparse.ArgumentParser(
         description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw '
                     'question-answer pair text dataset for Language Models')
+
+    parser.add_argument('--list', help='list all the available sources from stackexchange',
+                        required=False, action="store_true")
+
     parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. '
                                         'If "all", will download, extract & parse *every* stackoverflow site',
                         default="3dprinting.stackexchange,3dprinting.meta.stackexchange",

From 06149147267415e5f68c14fe784c821affac563b Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 20 Sep 2023 15:29:43 +0900
Subject: [PATCH 03/15] add proxy support and .env

---
 .gitignore |  2 ++
 README.md  | 27 ++++++++++++++++++++-------
 2 files changed, 22 insertions(+), 7 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3bf780b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.idea
+.env
\ No newline at end of file
diff --git a/README.md b/README.md
index 5df2be0..6bd94b1 100644
--- a/README.md
+++ b/README.md
@@ -3,16 +3,16 @@ A python tool for downloading & processing the [stackexchange data dumps](https:
 
 Download the whole processed dataset [here](https://eaidata.bmk.sh/data/stackexchange_dataset.tar)
 
-# Setup
+## Setup
 ```
 git clone https://github.com/EleutherAI/stackexchange_dataset/
 cd stackexchange_dataset
 pip install -r requirements.txt
 ```
 
-# Usage
+## Usage
 
-## List all available StackExchange dumps
+### List all available StackExchange dumps
 
 ```
 python3 main.py --list
@@ -20,7 +20,7 @@ python3 main.py --list
 
 
 
-## Download every StackExchange dump
+### Download every StackExchange dump
 
 To download *every* stackexchange dump & parse to text, simply run
 
@@ -28,7 +28,7 @@ To download *every* stackexchange dump & parse to text, simply run
 python3 main.py --names all
 ```
 
-## Download a single StackExchange dump
+### Download a single StackExchange dump
 
 To download only a single stackexchange, you can add the name as an optional argument. E.G:
 
@@ -36,7 +36,7 @@ python3 main.py --names security.stackexchange
 ```
 
-## Download a list of StackExchange dumps
+### Download a list of StackExchange dumps
 
 To download a list of multiple stackexchanges, you can add the names separated by commas. E.G:
 ```
 python3 main.py --names ru.stackoverflow,money.stackexchange
 ```
 
 The name should be the url of the stackoverflow site, minus `http(s)://` and `.com`. You can view all available stackoverflow dumps [here](https://archive.org/download/stackexchange).
-## All Usage Options:
+### All Usage Options:
 
 ```
 usage: main.py [-h] [--names NAMES]
 
 optional arguments:
   -h, --help     show this help message and exit
   --names NAMES  names of stackexchanges to download, extract & parse, separated
                  by commas. If "all", will download, extract & parse *every*
                  stackoverflow site
 ```
 
+### Proxy support
+
+If you need to pass through a proxy, you can configure an `.env` file and add it as follows:
+
+```
+HTTP_PROXY=http://proxy:port
+http_proxy=http://proxy:port
+HTTPS_PROXY=http://proxy:port
+https_proxy=http://proxy:port
+NO_PROXY=address to ignore,localhost
+no_proxy=address to ignore,localhost
+```
+
 # TODO:
 
 - [ ] should we add metadata to the text (i.e name of stackexchange & tags)?

From 096b4823cf25befa37d2db8d25beda51331200f9 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Sun, 24 Sep 2023 22:41:19 +0900
Subject: [PATCH 04/15] add documentation for --list

---
 README.md | 13 ++++++++++++-
 pairer.py |  2 +-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 6bd94b1..72fe7b7 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,18 @@ python3 main.py --names ru.stackoverflow,money.stackexchange
 
 The name should be the url of the stackoverflow site, minus `http(s)://` and `.com`. You can view all available stackoverflow dumps [here](https://archive.org/download/stackexchange).
 
-### All Usage Options:
+### List available sources in Stack Exchange
+
+This will list all the available sources:
+
+```
+python3 main.py --list
+```
+
+They will be printed as a list, which can be parsed with `grep` and other batch utilities.
+
+
+### All Usage Options:
 
 ```
 usage: main.py [-h] [--names NAMES]
diff --git a/pairer.py b/pairer.py
index 880bee7..e561b5d 100644
--- a/pairer.py
+++ b/pairer.py
@@ -29,7 +29,7 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         self.ar = archiver
 
     def main(self):
-        """iterates through SE xmls and:
+        """iterates through SE XMLs and:
 
         - stores PostTypeId="1" with AcceptedAnswerIds / Answers.
         - when an AcceptedAnswerId or Answer > min_score is reached, it should:

From 82605edb215a9403f43662f099b5d71b61f62010 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Sun, 24 Sep 2023 19:58:30 +0900
Subject: [PATCH 05/15] uniform archiver use

---
 main.py   | 65 +++++++++++++++++++++++++++++++++++--------------------
 pairer.py | 43 ++++++++++++++++++++----------------
 2 files changed, 65 insertions(+), 43 deletions(-)

diff --git a/main.py b/main.py
index 131653d..44fd2a0 100644
--- a/main.py
+++ b/main.py
@@ -1,12 +1,12 @@
 import argparse
 import os
+import sys
 import traceback
-import zipfile
 from itertools import repeat
 from multiprocessing import Pool, cpu_count
 
 import dotenv
-from lm_dataformat import Archive
+from lm_dataformat import Archive, JSONArchive, TextArchive, LM_DATAFORMAT_FORMAT, TEXT_FORMAT, SUPPORTED_FORMATS
 
 from downloader import Stack_Exchange_Downloader
 from pairer import QA_Pairer
@@ -23,38 +23,45 @@ def download_and_process_single(name, out_format, min_score, max_responses):
         similar_entries = list(filter(lambda key: key.startswith(name) or key.endswith(name), s.sites.keys()))
         print("StackExchange source not found. Perhaps you meant", similar_entries)
         return
+
     path_to_xml = "dumps/{}/Posts.xml".format(name)
     if name != "stackoverflow":
         path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
     else:
         path_to_7z = "dumps/stackoverflow.com-Posts.7z"
+
     out_folder = "out".format(name)
     os.makedirs(out_folder, exist_ok=True)
     if not os.path.isfile(path_to_7z):
         # download 7z if it's not downloaded already
         s.download()
+
     if not os.path.isfile(path_to_xml):
         # extract 7z if it's not extracted already
         s.extract()
-    if out_format == "lm_dataformat":
+
+    if out_format == LM_DATAFORMAT_FORMAT:
         archiver = Archive(out_folder)
-    elif out_format == "zip":
-        archiver = zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a')
+    elif out_format == TEXT_FORMAT:
+        archiver = TextArchive(out_format)
+    elif out_format == "json":
+        archiver = JSONArchive(out_folder)
     else:
         archiver = None
 
-    qa = QA_Pairer(path_to_xml, name=name, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
-    qa.main()
-    if out_format == "lm_dataformat":
+    qa = QA_Pairer(path_to_xml, name=name, out_format=out_format, archiver=archiver, min_score=min_score,
+                   max_responses=max_responses)
+    qa.process()
+    if out_format == LM_DATAFORMAT_FORMAT:
         archiver.commit(name)
-    elif out_format == "zip":
-        archiver.close()
-    try:
-        os.remove(path_to_7z)
-    except FileNotFoundError:
-        print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
-    filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
-    for f in filelist:
-        os.remove(os.path.join("dumps/{}".format(name), f))
+    else:
+        archiver.commit(name)
+    # try:
+    #     os.remove(path_to_7z)
+    # except FileNotFoundError:
+    #     print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
+    # filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
+    # for f in filelist:
+    #     os.remove(os.path.join("dumps/{}".format(name), f))
 except:
     traceback.print_exc()
@@ -63,7 +70,7 @@ def main(args):
     if args.list:
         s = Stack_Exchange_Downloader("all")
         print("List of all the sources of StackExchange: ")
-        print("- "+"\n- ".join(sorted(s.sites.keys())))
+        print("- " + "\n- ".join(sorted(s.sites.keys())))
         return
 
     names = args.names.split(',')
@@ -82,12 +89,16 @@ def main(args):
     # bring stackoverflow to the front so it is always processed first, since it's the largest
     if "stackoverflow" in names:
         names.insert(0, names.pop(names.index("stackoverflow")))
+    if args.no_zip:
+        print("Downloading everything requires the output to be compressed. Re-run *without* the option --no-zip.")
+        sys.exit(-1)
     print('Downloading and processing stackexchange dumps for {}'.format(names))
     # Download & Process
     # init pool with as many CPUs as available
     cpu_no = cpu_count() - 1
     p = Pool(cpu_no)
-    p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses)))
+    p.starmap(download_and_process_single,
+              zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses)))
 
 
 if __name__ == "__main__":
@@ -102,14 +113,20 @@ def main(args):
     parser.add_argument('--out_format', help='format of out file - if you are processing everything this will need to be '
                                              'lm_dataformat, as you will run into number of files per directory limits.',
-                        default="zip",
+                        default=TEXT_FORMAT,
+                        choices=SUPPORTED_FORMATS,
                         type=str)
-    parser.add_argument('--min_score', help='minimum score of a response in order to be included in the dataset. Default 3.',
+    parser.add_argument('--no-zip',
+                        help="Disable the compression of the output files. Writing plain files may cause problems with the filesystem",
+                        action="store_false",
+                        required=False,
+                        default=True)
+    parser.add_argument('--min_score',
+                        help='minimum score of a response in order to be included in the dataset. Default 3.',
                         type=int, default=3)
-    parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. '
-                                                'Default 3.', type=int, default=3)
+    parser.add_argument('--max_responses',
+                        help='maximum number of responses (sorted by score) to include for each question. '
+                             'Default 3.', type=int, default=3)
     args = parser.parse_args()
 
     main(args)
diff --git a/pairer.py b/pairer.py
index e561b5d..9599056 100644
--- a/pairer.py
+++ b/pairer.py
@@ -2,13 +2,15 @@
 import xml.etree.ElementTree as etree
 from collections import defaultdict
 from bs4 import BeautifulSoup
+from lm_dataformat import SUPPORTED_FORMATS, LM_DATAFORMAT_FORMAT, JSON_FORMAT, TEXT_FORMAT
 from tqdm import tqdm
-from utils import *
 
+from utils import *
+
 
 class QA_Pairer():
 
-    def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_responses=3, out_format="txt", archiver=None):
+    def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_responses=3, out_format=TEXT_FORMAT,
+                 archiver=None):
         """Makes a text dataset from StackExchange dumps"""
@@ -22,13 +24,13 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         # min_score required to parse an answer
         self.min_score = min_score
         self.max_responses = max_responses
-        assert out_format in ["txt", "lm_dataformat", "zip"], "Out format not recognized"
+        assert out_format in SUPPORTED_FORMATS, "Out format not recognized"
         self.out_format = out_format
-        if out_format in ["lm_dataformat", "zip"]:
+        if out_format in SUPPORTED_FORMATS:
             assert archiver is not None
             self.ar = archiver
 
-    def main(self):
+    def process(self):
         """iterates through SE XMLs and:
 
         - stores PostTypeId="1" with AcceptedAnswerIds / Answers.
@@ -40,7 +42,8 @@ def main(self): """ os.makedirs(self.out_folder, exist_ok=True) - for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), desc="Parsing {} XML file".format(self.name)): + for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), + desc="Parsing {} XML file".format(self.name)): if elem.tag == "row": try: attribs = defaultdict(lambda: None, elem.attrib) @@ -94,7 +97,8 @@ def add_answer(self, a_attribs): if a_attribs["Id"] is not None: parent = self.questions[a_attribs["ParentId"]] if parent is not None: - self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") + self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, + "answer") self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 else: self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 @@ -124,25 +128,26 @@ def check_complete(self, a_attribs): key_score_dict = {} for k, a in parent["Answers"].items(): key_score_dict[k] = int(a["Score"]) - key_score_dict = {k: v for k, v in sorted(key_score_dict.items(), key=lambda item: item[1], reverse=True)} + key_score_dict = {k: v for k, v in + sorted(key_score_dict.items(), key=lambda item: item[1], reverse=True)} count = 0 for k in key_score_dict: if count >= self.max_responses: break - out_str += 'A:\n\n{}\n\n'.format(BeautifulSoup(parent["Answers"][k]["Body"], "html.parser").get_text()) + out_str += 'A:\n\n{}\n\n'.format( + BeautifulSoup(parent["Answers"][k]["Body"], "html.parser").get_text()) count += 1 - if self.out_format == "txt": - with open("{}/{}".format(self.out_folder, out_name), 'w') as f: - try: - f.write(filter_newlines(out_str)) - except: - f.write(filter_newlines(handle_unicode_errors(out_str))) - elif self.out_format == "zip": + if self.out_format == TEXT_FORMAT: + try: + self.ar.add_data(filter_newlines(out_str)) + except: + self.ar.add_data(filter_newlines(handle_unicode_errors(out_str))) + if self.out_format == JSON_FORMAT: try: - self.ar.writestr(out_name, filter_newlines(out_str)) + self.ar.add_data(filter_newlines(out_str)) except: - self.ar.writestr(out_name, filter_newlines(handle_unicode_errors(out_str))) - elif self.out_format == "lm_dataformat": + self.ar.add_data(filter_newlines(handle_unicode_errors(out_str))) + elif self.out_format == LM_DATAFORMAT_FORMAT: try: self.ar.add_data(filter_newlines(out_str), meta={ 'name': out_name}) From 39064963efcd5ce5b5ff13c405cb2c9ea7ab047a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 28 Sep 2023 21:24:40 +0900 Subject: [PATCH 06/15] move the formatting in the commit part. Use a dict-based structure to hold the data model. --- pairer.py | 56 ++++++++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/pairer.py b/pairer.py index 9599056..1b2c4c5 100644 --- a/pairer.py +++ b/pairer.py @@ -1,12 +1,14 @@ import traceback import xml.etree.ElementTree as etree from collections import defaultdict + from bs4 import BeautifulSoup from lm_dataformat import SUPPORTED_FORMATS, LM_DATAFORMAT_FORMAT, JSON_FORMAT, TEXT_FORMAT from tqdm import tqdm from utils import * + class QA_Pairer(): def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_responses=3, out_format=TEXT_FORMAT, @@ -111,6 +113,13 @@ def check_complete(self, a_attribs): removes from dict and prints to file. 
""" keys_to_del = [] + qa_structure = { + "question": { + "title": "", + "body": "" + }, + "answers": [] + } parent = self.questions[a_attribs["ParentId"]] if a_attribs is not None and parent is not None: if parent["AnswerCount"] is not None and parent["ParsedAnswers"] is not None: @@ -118,41 +127,28 @@ def check_complete(self, a_attribs): keys_to_del.append(a_attribs["ParentId"]) if parent["Answers"] is not None and len(parent["Answers"]) > 0: out_name = "{}_{}.txt".format(self.name, parent["Id"].zfill(10)) - out_str = "" - out_str += 'Q:\n\n' + question_structure = qa_structure['question'] if parent["Title"] is not None: - out_str += '{}\n\n'.format(BeautifulSoup(parent["Title"], "html.parser").get_text()) + question_structure['title'] = BeautifulSoup(parent["Title"], "html.parser").get_text() if parent["Body"] is not None: - out_str += '{}\n\n'.format(BeautifulSoup(parent["Body"], "html.parser").get_text()) + question_structure['body'] = BeautifulSoup(parent["Body"], "html.parser").get_text() if parent["Answers"] is not None: key_score_dict = {} + answers_structure_tmp = [] for k, a in parent["Answers"].items(): - key_score_dict[k] = int(a["Score"]) - key_score_dict = {k: v for k, v in - sorted(key_score_dict.items(), key=lambda item: item[1], reverse=True)} - count = 0 - for k in key_score_dict: - if count >= self.max_responses: - break - out_str += 'A:\n\n{}\n\n'.format( - BeautifulSoup(parent["Answers"][k]["Body"], "html.parser").get_text()) - count += 1 - if self.out_format == TEXT_FORMAT: - try: - self.ar.add_data(filter_newlines(out_str)) - except: - self.ar.add_data(filter_newlines(handle_unicode_errors(out_str))) - if self.out_format == JSON_FORMAT: - try: - self.ar.add_data(filter_newlines(out_str)) - except: - self.ar.add_data(filter_newlines(handle_unicode_errors(out_str))) + # key_score_dict[k] = int(a["Score"]) + answers_structure_tmp.append({ + "id": a['Id'], + "body": BeautifulSoup(a["Body"], "html.parser").get_text(), + "score": int(a["Score"]) + }) + qa_structure['answers'] = sorted(answers_structure_tmp, key=lambda item: item['score'], + reverse=True)[0:self.max_responses] + + if self.out_format == TEXT_FORMAT or self.out_format == JSON_FORMAT: + self.ar.add_data(qa_structure) elif self.out_format == LM_DATAFORMAT_FORMAT: - try: - self.ar.add_data(filter_newlines(out_str), meta={ - 'name': out_name}) - except: - self.ar.add_data(filter_newlines(handle_unicode_errors(out_str)), meta={ - 'name': out_name}) + self.ar.add_data(qa_structure, meta={'name': out_name}) + for key in keys_to_del: self.questions.pop(key, None) From 69d69b0658fc708005a4c214eaa53b508e63db32 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Sep 2023 09:00:33 +0900 Subject: [PATCH 07/15] add proper JSON output with structured information --- main.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/main.py b/main.py index 44fd2a0..7637409 100644 --- a/main.py +++ b/main.py @@ -30,7 +30,7 @@ def download_and_process_single(name, out_format, min_score, max_responses): else: path_to_7z = "dumps/stackoverflow.com-Posts.7z" - out_folder = "out".format(name) + out_folder = "out/{}".format(name) os.makedirs(out_folder, exist_ok=True) if not os.path.isfile(path_to_7z): # download 7z if it's not downloaded already @@ -43,7 +43,7 @@ def download_and_process_single(name, out_format, min_score, max_responses): if out_format == LM_DATAFORMAT_FORMAT: archiver = Archive(out_folder) elif out_format == TEXT_FORMAT: - archiver = TextArchive(out_format) 
+        archiver = TextArchive(out_folder)
     elif out_format == "json":
         archiver = JSONArchive(out_folder)
     else:
         archiver = None
@@ -55,13 +55,17 @@ def download_and_process_single(name, out_format, min_score, max_responses):
         archiver.commit(name)
     else:
         archiver.commit(name)
-    # try:
-    #     os.remove(path_to_7z)
-    # except FileNotFoundError:
-    #     print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
-    # filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
-    # for f in filelist:
-    #     os.remove(os.path.join("dumps/{}".format(name), f))
+
+    try:
+        os.remove(path_to_7z)
+    except FileNotFoundError:
+        print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
+    directory_uncompressed = "dumps/{}".format(name)
+    filelist = [f for f in os.listdir(directory_uncompressed)
+                if f.endswith(".xml")]
+    for f in filelist:
+        os.remove(os.path.join(directory_uncompressed, f))
+    os.removedirs(directory_uncompressed)
 except:
     traceback.print_exc()
@@ -82,9 +86,9 @@ def main(args):
     # bring stackoverflow to the front so it is always processed first, since it's the largest
     if "stackoverflow" in names:
         names.insert(0, names.pop(names.index("stackoverflow")))
-    if args.no_zip:
-        print("Downloading everything requires the output to be compressed. Re-run *without* the option --no-zip.")
-        sys.exit(-1)
+    # if args.no_zip:
+    #     print("Downloading everything requires the output to be compressed. Re-run *without* the option --no-zip.")
+    #     sys.exit(-1)
     print('Downloading and processing stackexchange dumps for {}'.format(names))
     # Download & Process
     # init pool with as many CPUs as available
@@ -108,14 +112,14 @@ def main(args):
     parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. '
                                         'If "all", will download, extract & parse *every* stackoverflow site',
                         default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
                         type=str)
-    parser.add_argument('--out_format', help='format of out file - if you are processing everything this will need to be '
-                                             'lm_dataformat, as you will run into number of files per directory limits.',
+    parser.add_argument('--out_format',
+                        help='format of out file - if you are processing everything this will need to be '
+                             'lm_dataformat, as you will run into number of files per directory limits.',
                         default=TEXT_FORMAT,
                         choices=SUPPORTED_FORMATS,
                         type=str)
-    parser.add_argument('--no-zip',
-                        help="Disable the compression of the output files. Writing plain files may cause problems with the filesystem",
-                        action="store_false",
-                        required=False,
-                        default=True)
+    # parser.add_argument('--no-zip',
+    #                     help="Disable the compression of the output files. Writing plain files may cause problems with the filesystem",
+    #                     action="store_true",
+    #                     required=False,
+    #                     default=False)
     parser.add_argument('--min_score',
                         help='minimum score of a response in order to be included in the dataset. Default 3.',
                         type=int, default=3)
     parser.add_argument('--max_responses',
                         help='maximum number of responses (sorted by score) to include for each question. '
                             'Default 3.', type=int, default=3)
     args = parser.parse_args()
 
     main(args)

From aab78085b629104453a88920c29d9fb4904a95fe Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Fri, 29 Sep 2023 11:05:24 +0900
Subject: [PATCH 08/15] small improvements

---
 main.py   | 5 +----
 pairer.py | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/main.py b/main.py
index 7637409..fcf510b 100644
--- a/main.py
+++ b/main.py
@@ -51,10 +51,7 @@ def download_and_process_single(name, out_format, min_score, max_responses):
     qa = QA_Pairer(path_to_xml, name=name, out_format=out_format, archiver=archiver, min_score=min_score,
                    max_responses=max_responses)
     qa.process()
-    if out_format == LM_DATAFORMAT_FORMAT:
-        archiver.commit(name)
-    else:
-        archiver.commit(name)
+    archiver.commit(name)
 
     try:
         os.remove(path_to_7z)
diff --git a/pairer.py b/pairer.py
index 1b2c4c5..800762f 100644
--- a/pairer.py
+++ b/pairer.py
@@ -129,7 +129,7 @@ def check_complete(self, a_attribs):
                 out_name = "{}_{}.txt".format(self.name, parent["Id"].zfill(10))
                 question_structure = qa_structure['question']
                 if parent["Title"] is not None:
-                    question_structure['title'] = BeautifulSoup(parent["Title"], "html.parser").get_text()
+                    question_structure['title'] = parent["Title"]
                 if parent["Body"] is not None:
                     question_structure['body'] = BeautifulSoup(parent["Body"], "html.parser").get_text()
                 if parent["Answers"] is not None:

From 40e96229efdd59c639063d06645257d41a8fade4 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Fri, 29 Sep 2023 11:05:39 +0900
Subject: [PATCH 09/15] add --keep-sources for keeping the original 7z

---
 main.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/main.py b/main.py
index fcf510b..9ac17b0 100644
--- a/main.py
+++ b/main.py
@@ -14,7 +14,7 @@
 dotenv.load_dotenv(override=True)
 
 
-def download_and_process_single(name, out_format, min_score, max_responses):
+def download_and_process_single(name, out_format, min_score, max_responses, keep_sources=False):
     try:
         name = name.strip().lower()
         os.makedirs("dumps", exist_ok=True)
@@ -49,14 +49,17 @@ def download_and_process_single(name, out_format, min_score, max_responses):
     qa = QA_Pairer(path_to_xml, name=name, out_format=out_format, archiver=archiver, min_score=min_score,
                    max_responses=max_responses)
     qa.process()
     archiver.commit(name)
 
-    try:
-        os.remove(path_to_7z)
-    except FileNotFoundError:
-        print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
+    if not keep_sources:
+        try:
+            os.remove(path_to_7z)
+        except FileNotFoundError:
+            print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
+
     directory_uncompressed = "dumps/{}".format(name)
     filelist = [f for f in os.listdir(directory_uncompressed)
                 if f.endswith(".xml")]
     for f in filelist:
         os.remove(os.path.join(directory_uncompressed, f))
     os.removedirs(directory_uncompressed)
@@ -92,7 +95,8 @@ def main(args):
     cpu_no = cpu_count() - 1
     p = Pool(cpu_no)
     p.starmap(download_and_process_single,
-              zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses)))
+              zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses),
+                  repeat(args.keep_sources)))
 
 
 if __name__ == "__main__":
@@ -111,6 +115,9 @@ def main(args):
     parser.add_argument('--out_format',
                         help='format of out file - if you are processing everything this will need to be '
                             'lm_dataformat, as you will run into number of files per directory limits.',
                         default=TEXT_FORMAT,
                         choices=SUPPORTED_FORMATS,
                         type=str)
     # parser.add_argument('--no-zip',
     #                     help="Disable the compression of the output files. Writing plain files may cause problems with the filesystem",
     #                     action="store_true",
     #                     required=False,
     #                     default=False)
     parser.add_argument('--min_score',
                         help='minimum score of a response in order to be included in the dataset. Default 3.',
                         type=int, default=3)
     parser.add_argument('--max_responses',
                         help='maximum number of responses (sorted by score) to include for each question. '
                             'Default 3.', type=int, default=3)
+    parser.add_argument('--keep-sources',
+                        help='Do not clean up the downloaded source 7z files.',
+                        action="store_true", default=False)
     args = parser.parse_args()
 
     main(args)

From 2587fd16ec52962c809a92cb8ae3b3878be74ec0 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Fri, 29 Sep 2023 11:23:17 +0900
Subject: [PATCH 10/15] cosmetics

---
 main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index 9ac17b0..a5b1c9a 100644
--- a/main.py
+++ b/main.py
@@ -1,12 +1,12 @@
 import argparse
 import os
-import sys
 import traceback
 from itertools import repeat
 from multiprocessing import Pool, cpu_count
 
 import dotenv
-from lm_dataformat import Archive, JSONArchive, TextArchive, LM_DATAFORMAT_FORMAT, TEXT_FORMAT, SUPPORTED_FORMATS
+from lm_dataformat import Archive, JSONArchive, TextArchive, LM_DATAFORMAT_FORMAT, TEXT_FORMAT, SUPPORTED_FORMATS, \
+    JSON_FORMAT
 
 from downloader import Stack_Exchange_Downloader
 from pairer import QA_Pairer
@@ -44,7 +44,7 @@ def download_and_process_single(name, out_format, min_score, max_responses, keep
         archiver = Archive(out_folder)
     elif out_format == TEXT_FORMAT:
         archiver = TextArchive(out_folder)
-    elif out_format == "json":
+    elif out_format == JSON_FORMAT:
         archiver = JSONArchive(out_folder)
     else:
         archiver = None

From 1a24a8908009bec22a5ad3e5e77efc8522bb37ce Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Fri, 29 Sep 2023 11:31:10 +0900
Subject: [PATCH 11/15] cleanup

---
 pairer.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pairer.py b/pairer.py
index 800762f..838e319 100644
--- a/pairer.py
+++ b/pairer.py
@@ -3,7 +3,7 @@
 from collections import defaultdict
 
 from bs4 import BeautifulSoup
-from lm_dataformat import SUPPORTED_FORMATS, LM_DATAFORMAT_FORMAT, JSON_FORMAT, TEXT_FORMAT
+from lm_dataformat import SUPPORTED_FORMATS, LM_DATAFORMAT_FORMAT, JSON_FORMAT, TEXT_FORMAT, TextArchive
 from tqdm import tqdm
 
 from utils import *
@@ -145,10 +145,12 @@ def check_complete(self, a_attribs):
                 qa_structure['answers'] = sorted(answers_structure_tmp, key=lambda item: item['score'],
                                                  reverse=True)[0:self.max_responses]
 
-            if self.out_format == TEXT_FORMAT or self.out_format == JSON_FORMAT:
+            if self.out_format == TEXT_FORMAT:
+                self.ar.add_data(TextArchive.to_text(qa_structure))
+            elif self.out_format == JSON_FORMAT:
                 self.ar.add_data(qa_structure)
             elif self.out_format == LM_DATAFORMAT_FORMAT:
-                self.ar.add_data(qa_structure, meta={'name': out_name})
+                self.ar.add_data(TextArchive.to_text(qa_structure), meta={'name': out_name})
 
         for key in keys_to_del:
             self.questions.pop(key, None)

From 6be5af54eb082ca0da3c8a36ab63a795f11f09af Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Sun, 5 Nov 2023 09:03:14 +0900
Subject: [PATCH 12/15] test dependency on updated lm_dataformat

---
 .github/workflows/ci-build.yml | 35 ++++++++++++++++++++++++++++++++++
 requirements.txt               |  3 ++-
 2 files changed, 37 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ci-build.yml

diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml
new file mode 100644
index 0000000..1376460
--- /dev/null
+++ b/.github/workflows/ci-build.yml
@@ -0,0 +1,35 @@
+name: Build unstable
+
+on: [push]
+
+concurrency:
+  group: unstable
+#  cancel-in-progress: true
+
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.9
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.9"
+      - name: Cleanup more disk space
+        run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install --upgrade flake8 pytest pycodestyle
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+      - name: Lint with flake8
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+#      - name: Test with pytest
+#        run: |
+#          python -m pytest --rootdir .
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 9d3cf3e..fb4e6b3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,5 +2,6 @@ bs4
 lxml
 py7zr
 tqdm
-lm-dataformat
+# lm-dataformat
+-e https://github.com/lfoppiano/lm_dataformat
 jsonlines
\ No newline at end of file

From 6923461d7b2ed907950a471c5d818bd3b2998f6d Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Sun, 5 Nov 2023 09:11:17 +0900
Subject: [PATCH 13/15] fix format

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index fb4e6b3..c451745 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,5 +3,5 @@ lxml
 py7zr
 tqdm
 # lm-dataformat
--e https://github.com/lfoppiano/lm_dataformat
+-e git+https://github.com/lfoppiano/lm_dataformat
 jsonlines
\ No newline at end of file

From a64b36f4ae27e547196a5aa6922efe8129c532fd Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Sun, 5 Nov 2023 09:12:53 +0900
Subject: [PATCH 14/15] fix format

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index c451745..b5442ff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,5 +3,5 @@ lxml
 py7zr
 tqdm
 # lm-dataformat
--e git+https://github.com/lfoppiano/lm_dataformat
+-e git+https://github.com/lfoppiano/lm_dataformat.git
 jsonlines
\ No newline at end of file

From adf3d8ab26e939f6bdc22a2be98aa66d3fb739f9 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Sun, 5 Nov 2023 09:15:03 +0900
Subject: [PATCH 15/15] fix format

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b5442ff..cfb9ed1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,5 +3,5 @@ lxml
 py7zr
 tqdm
 # lm-dataformat
--e git+https://github.com/lfoppiano/lm_dataformat
+-e git+https://github.com/lfoppiano/lm_dataformat.git#egg=lm_dataformat
 jsonlines
\ No newline at end of file
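
Note on the suggestion logic from PATCH 01: a source is proposed only when its key starts or ends with the requested name. A minimal, self-contained sketch of that behaviour — the `sites` dict below is illustrative only; the real keys come from `Stack_Exchange_Downloader("all").sites`:

```
# Sketch of the similar-source suggestion added in PATCH 01.
# The sites dict is made up; real keys come from the downloader.
sites = {
    "security.stackexchange": {},
    "security.meta.stackexchange": {},
    "ru.stackoverflow": {},
}

def suggest(name, sites):
    # Mirrors the filter in download_and_process_single: keep keys that
    # start or end with the requested name.
    name = name.strip().lower()
    return list(filter(lambda key: key.startswith(name) or key.endswith(name), sites.keys()))

print(suggest("security", sites))
# ['security.stackexchange', 'security.meta.stackexchange']
```

Names that only occur in the middle of a key are not matched, so the suggestion list can come back empty for heavily misspelled inputs.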
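The proxy support from PATCH 03 works because `dotenv.load_dotenv(override=True)` (the call added to main.py in PATCH 01) copies the `.env` entries into the process environment, where HTTP clients that honour the standard `HTTP_PROXY`/`HTTPS_PROXY`/`NO_PROXY` variables pick them up. A minimal sketch, assuming an `.env` file like the one shown in the README sits in the working directory:

```
# Sketch: load .env and confirm the proxy variables are visible to the
# process; environment-aware HTTP clients will then route through them.
import os
import dotenv

dotenv.load_dotenv(override=True)  # override=True lets .env win over existing env vars

print(os.environ.get("HTTPS_PROXY"))  # e.g. http://proxy:port
print(os.environ.get("NO_PROXY"))     # e.g. address to ignore,localhost
```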
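For reference, the answer selection that PATCH 06 moves into `check_complete` keeps only the highest-scored answers. A small sketch of the dict-based structure and the ranking, with made-up answers (in the real code they are parsed from Posts.xml):

```
# Sketch of the qa_structure introduced in PATCH 06: answers are sorted
# by score (descending) and truncated to max_responses.
max_responses = 3
answers = [
    {"id": "11", "body": "answer text", "score": 5},
    {"id": "12", "body": "answer text", "score": 12},
    {"id": "13", "body": "answer text", "score": 1},
    {"id": "14", "body": "answer text", "score": 7},
]

qa_structure = {
    "question": {"title": "Question title", "body": "Question body"},
    "answers": sorted(answers, key=lambda item: item["score"], reverse=True)[0:max_responses],
}

print([a["id"] for a in qa_structure["answers"]])  # ['12', '14', '11']
```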