diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2664379
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__
+dumps
+out*
+logs*
+test*
diff --git a/README.md b/README.md
index 2f9cb51..2229079 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,6 @@
 # stackexchange_dataset
 A python tool for downloading & processing the [stackexchange data dumps](https://archive.org/details/stackexchange) into a text dataset for Language Models.
+The schema documentation for the dumps can be found [here](https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede/2678#2678).
 
 Download the whole processed dataset [here](https://eaidata.bmk.sh/data/stackexchange_dataset.tar)
 
@@ -10,6 +11,7 @@
 cd stackexchange_dataset
 pip install -r requirements.txt
 ```
 # Usage
+The default output format for the parsed data is .zip. To produce an lm_dataformat dataset instead, pass `--out_format=lm_dataformat` to the commands below.
 
 To download *every* stackexchange dump & parse to text, simply run
diff --git a/downloader.py b/downloader.py
index 228b18e..ba51683 100644
--- a/downloader.py
+++ b/downloader.py
@@ -3,7 +3,7 @@
 from utils import *
 import py7zr
 
-
+curr_dir = os.path.dirname(__file__)
 
 class Stack_Exchange_Downloader():
 
     def __init__(self, name):
@@ -22,18 +22,18 @@ def parse_sitesmap(self, sitesmap):
             site_name = url.replace(".com", "").replace(".net", "")
             download_link = "https://archive.org/download/stackexchange/" + url + ".7z"
             if url == "stackoverflow.com":
-                download_link = "https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z"
+                download_link = "https://archive.org/download/stackexchange/Stackoverflow.com-Posts.7z"
             self.sites[site_name] = {"url" : url, "download" : download_link}
 
     def download(self):
         if self.name == "all":
             for k in self.sites:
-                command = "wget {} -P dumps".format(self.sites[k]["download"])
+                command = "wget {} -P {}/dumps".format(self.sites[k]["download"], curr_dir)
                 print(command)
                 if os.system(command):
                     print('Download for {} failed!'.format(k))
         else:
-            command = "wget {} -P dumps".format(self.sites[self.name]["download"])
+            command = "wget {} -P {}/dumps".format(self.sites[self.name]["download"], curr_dir)
             print(command)
             if os.system(command):
                 print('Download for {} failed!'.format(self.name))
@@ -45,8 +45,7 @@ def extract(self):
             #  , mode='r'))
             # archive.extractall()
             # archive.close()
-            command = "py7zr x dumps/{} dumps/{}".format(self.sites[k]["download"].replace("https://archive.org/download/stackexchange/", ""),
-                                                         k)
+            command = "py7zr x {}/dumps/{} {}/dumps/{}".format(curr_dir, self.sites[k]["download"].replace("https://archive.org/download/stackexchange/", ""), curr_dir, k)
             print(command)
             if os.system(command):
                 print('Extraction for {} failed!'.format(k))
@@ -56,8 +55,7 @@ def extract(self):
             #  , mode='r'))
             # archive.extractall()
             # archive.close()
-            command = "py7zr x dumps/{} dumps/{}".format(self.sites[self.name]["download"].replace("https://archive.org/download/stackexchange/", ""),
-                                                         self.name)
+            command = "py7zr x {}/dumps/{} {}/dumps/{}".format(curr_dir, self.sites[self.name]["download"].replace("https://archive.org/download/stackexchange/", ""), curr_dir, self.name)
            print(command)
             if os.system(command):
                 print('Extraction for {} failed!'.format(self.name))
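For reference, the download step simply shells out to wget; a minimal sketch of the command string the loop above builds (the site name and paths are illustrative, not taken from a real run):

```python
# Illustrative only: how the download command is assembled (hypothetical site/paths).
curr_dir = "/home/user/stackexchange_dataset"   # what os.path.dirname(__file__) might return
download = "https://archive.org/download/stackexchange/ai.stackexchange.com.7z"

command = "wget {} -P {}/dumps".format(download, curr_dir)
print(command)
# wget https://archive.org/download/stackexchange/ai.stackexchange.com.7z -P /home/user/stackexchange_dataset/dumps
```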
diff --git a/main.py b/main.py
index 50a727a..658f44b 100644
--- a/main.py
+++ b/main.py
@@ -7,59 +7,71 @@
 from itertools import repeat
 from lm_dataformat import Archive
 import zipfile
+import os
+import json
 
-
+curr_dir = os.path.dirname(__file__)
 
 def download_and_process_single(name, out_format, min_score, max_responses):
     try:
         name = name.strip().lower()
-        os.makedirs("dumps", exist_ok=True)
+        os.makedirs("{}/dumps".format(curr_dir), exist_ok=True)
         s = Stack_Exchange_Downloader(name)
-        path_to_xml = "dumps/{}/Posts.xml".format(name)
+        # *.7z files are downloaded from https://archive.org/download/stackexchange/
         if name != "stackoverflow":
-            path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
+            path_to_7z = "{}/dumps/{}.7z".format(curr_dir, s.sites[name]["url"])
         else:
-            path_to_7z = "dumps/stackoverflow.com-Posts.7z"
-        out_folder = "out".format(name)
-        os.makedirs(out_folder, exist_ok=True)
+            path_to_7z = "{}/dumps/Stackoverflow.com-Posts.7z".format(curr_dir)
         if not os.path.isfile(path_to_7z):
             # download 7z if it's not downloaded already
             s.download()
+
+        # *.xml files are extracted from *.7z files using py7zr
+        path_to_xml = "{}/dumps/{}/Posts.xml".format(curr_dir, name)
         if not os.path.isfile(path_to_xml):
             # extract 7z if it's not extracted already
             s.extract()
+
+        out_folder = "{}/out".format(curr_dir)
+        # out_folder = "{}/../../../suriyagwu/stackexchange/all".format(curr_dir)
+        os.makedirs(out_folder, exist_ok=True)
+        os.makedirs("{}/samples".format(out_folder), exist_ok=True)
+        os.makedirs("{}/misc".format(out_folder), exist_ok=True)
         if out_format == "lm_dataformat":
             archiver = Archive(out_folder)
         elif out_format == "zip":
             archiver = zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a')
         else:
             archiver = None
-        qa = QA_Pairer(path_to_xml, name=name, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
+        qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
         qa.main()
         if out_format == "lm_dataformat":
             archiver.commit(name)
         elif out_format == "zip":
             archiver.close()
-        try:
-            os.remove(path_to_7z)
-        except FileNotFoundError:
-            print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
-        filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
-        for f in filelist:
-            os.remove(os.path.join("dumps/{}".format(name), f))
+
+        # save the qa.questions dict (questions left unprocessed) to a json file
+        with open("{}/misc/{}_unprocessed_questions.json".format(out_folder, name), "w") as f:
+            json.dump(qa.questions, f, indent=4)
+
+        # try:
+        #     os.remove(path_to_7z)
+        # except FileNotFoundError:
+        #     print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
+        # filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
+        # for f in filelist:
+        #     os.remove(os.path.join("dumps/{}".format(name), f))
     except:
         traceback.print_exc()
 
 
 def main(args):
-    names = args.names.split(',')
+    names = args.names.split(',')
     if names[0].strip().lower() == "all":
         s = Stack_Exchange_Downloader("all")
         names = []
         for k in s.sites:
             names.append(k)
-        # bring stackoverflow to the front so it is always processed first, since it's the largest
-        if "stackoverflow" in names:
-            names.insert(0, names.pop(names.index("stackoverflow")))
+        print('Removing stackoverflow from the list of sites to process. Process it separately.')
+        names.pop(names.index("stackoverflow"))
     print('Downloading and processing stackexchange dumps for {}'.format(names))
     # Download & Process
     # init pool with as many CPUs as available
@@ -70,20 +82,31 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw '
-                    'question-answer pair text dataset for Language Models')
-    parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. '
-                                        'If "all", will download, extract & parse *every* stackoverflow site',
-                        default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
-                        type=str)
-    parser.add_argument('--out_format', help='format of out file - if you are processing everything this will need to be '
-                                             'lm_dataformat, as you will run into number of files per directory limits.',
-                        default="zip",
-                        type=str)
-    parser.add_argument('--min_score', help='minimum score of a response in order to be included in the dataset. Default 3.',
-                        type=int, default=3)
-    parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. '
-                                                'Default 3.', type=int, default=3)
+        description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw question-answer pair text dataset for Language Models')
+    parser.add_argument(
+        '--names',
+        help='names of stackexchanges to download, extract & parse, separated by commas. If "all", will download, extract & parse *every* stackexchange site',
+        default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
+        type=str
+    )
+    parser.add_argument(
+        '--out_format',
+        help='format of out file - if you are processing everything this will need to be lm_dataformat, as you will run into number of files per directory limits.',
+        default="zip",
+        type=str
+    )
+    parser.add_argument(
+        '--min_score',
+        help='minimum score of a response in order to be included in the dataset. Default 3.',
+        type=int,
+        default=3
+    )
+    parser.add_argument(
+        '--max_responses',
+        help='maximum number of responses (sorted by score) to include for each question. Default 100.',
+        type=int,
+        default=100
+    )
 
     args = parser.parse_args()
     main(args)
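If you want to drive a single site without the CLI, a minimal sketch of calling the updated entry point directly (the site name and values are only illustrative; the new defaults are min_score=3 and max_responses=100):

```python
# Sketch: processing one site programmatically (illustrative values, run from the repo).
from main import download_and_process_single

# Downloads the site's .7z dump into <repo>/dumps, extracts Posts.xml, and writes
# <repo>/out/3dprinting.stackexchange.zip plus the samples/ and misc/ artifacts.
download_and_process_single("3dprinting.stackexchange", "zip", 3, 100)
```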
diff --git a/main_filter.py b/main_filter.py
new file mode 100644
index 0000000..d4f7e0e
--- /dev/null
+++ b/main_filter.py
@@ -0,0 +1,45 @@
+'''
+Filter the stackoverflow dataset generated by main.py in lm_dataformat.
+Edit filter_logic to implement filtering on any field of a sample's meta dict.
+'''
+from lm_dataformat import Reader, Archive
+import os
+import glob
+
+curr_dir = os.path.dirname(__file__)
+out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow".format(curr_dir)
+full_data_file = "{}/data_*stackoverflow.jsonl.zst".format(out_folder)
+
+class FilterMeta:
+    # currently implements an OR over the filter tags
+    def __init__(self):
+        self.filter_tags = ['python']
+        self.file_suffix = "_" + "_".join(self.filter_tags)
+
+    def filter_logic(self, meta):
+        for tag in self.filter_tags:
+            if tag in meta['tags']:
+                return True
+        return False
+
+new_archive = Archive(out_folder)
+filter_meta = FilterMeta()
+num_accepted = 0
+num_rejected = 0
+for filename in glob.glob(full_data_file):
+    # create a reader object for the file
+    print("Filtering {}".format(filename))
+    rdr = Reader(filename)
+    # iterate over the samples in the file
+    for doc, meta in rdr.stream_data(get_meta=True):
+        if filter_meta.filter_logic(meta):
+            # add the sample to the new archive
+            new_archive.add_data(doc, meta)
+            num_accepted += 1
+        else:
+            num_rejected += 1
+
+filtered_data_file_name = "stackoverflow" + filter_meta.file_suffix
+new_archive.commit(filtered_data_file_name)
+print("##### Stats #####")
+print(f"num_accepted: {num_accepted}, num_rejected: {num_rejected}")
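main_filter.py keeps a sample when any one of filter_tags appears in meta['tags']. If a stricter rule is needed, a sketch of an all-tags variant (this class is hypothetical and not part of the patch; the tag values are illustrative):

```python
# Sketch: an AND variant of the tag filter (hypothetical, not part of this patch).
class FilterMetaAll:
    def __init__(self):
        self.filter_tags = ['python', 'pandas']            # illustrative tags
        self.file_suffix = "_" + "_".join(self.filter_tags)

    def filter_logic(self, meta):
        # keep a sample only if every requested tag is present
        return all(tag in meta['tags'] for tag in self.filter_tags)
```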
diff --git a/pairer.py b/pairer.py
index 880bee7..227e501 100644
--- a/pairer.py
+++ b/pairer.py
@@ -1,6 +1,6 @@
 import traceback
 import xml.etree.ElementTree as etree
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from bs4 import BeautifulSoup
 from tqdm import tqdm
 from utils import *
@@ -12,7 +12,7 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         """Makes a text dataset from StackExchange dumps"""
         self.xml_path = xml_path
         if name is None:
-            self.name = os.path.dirname(xml_path).replace("dumps/", "")
+            self.name = os.path.basename(os.path.dirname(xml_path))
         else:
             self.name = name
         # dict to save questions
@@ -27,6 +27,13 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         if out_format in ["lm_dataformat", "zip"]:
             assert archiver is not None
             self.ar = archiver
+        self.sample = True
+        self.num_posts = 0
+        self.num_nonQA_posts = 0
+        self.num_discarded_answers = 0
+        self.num_discarded_questions = 0
+        self.num_questions = 0
+        self.num_answers = 0
 
     def main(self):
         """iterates through SE xmls and:
@@ -39,27 +46,43 @@
         > Delete from memory
         """
-        os.makedirs(self.out_folder, exist_ok=True)
-        for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), desc="Parsing {} XML file".format(self.name)):
+        os.makedirs(self.out_folder, exist_ok=True)
+        for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), desc="Parsing {} XML file".format(self.name), disable=True):
             if elem.tag == "row":
                 try:
                     attribs = defaultdict(lambda: None, elem.attrib)
+                    self.num_posts += 1
+                    # checks if PostTypeId=1
                     if is_question(attribs):
                         if has_answers(attribs):
+                            # trim post data to ['Id', 'Body', 'Title', 'Tags', 'AnswerCount', 'AcceptedAnswerId', 'PostTypeId']
+                            # other potentially useful keys: ['CreationDate', 'Score', 'ViewCount', 'OwnerUserId', 'LastActivityDate', 'CommentCount', 'ContentLicense']
                             trim_attribs(attribs, "question")
                             self.questions[attribs["Id"]] = attribs
                         else:
                             # if the question has no answers, discard it
+                            self.num_discarded_questions += 1
                             continue
+                    # checks if PostTypeId=2
                     elif is_answer(attribs):
                         # if is accepted answer, append answer Body to relevant questions "AcceptedAnswer" field
-                        # if the answer's score > min_score
-                        # append the answer to the relevant question's OtherAnswers dict
+                        # if the answer's score > min_score, append the answer to the relevant question's OtherAnswers dict
                         self.add_answer(attribs)
                         self.check_complete(attribs)
+                    else:
+                        self.num_nonQA_posts += 1
                     elem.clear()
                 except:
                     traceback.print_exc()
+
+        print("##### Stats #####")
+        print(f"num_questions={self.num_questions}, num_discarded_questions={self.num_discarded_questions}")
+        print(f"num_answers={self.num_answers}, num_discarded_answers={self.num_discarded_answers}")
+        print(f"num_posts={self.num_posts}, num_nonQA_posts={self.num_nonQA_posts}")
+        unprocessed_questions = len(self.questions)
+        unprocessed_answers = sum(len(q_att['Answers']) for q_att in self.questions.values())
+        print(f"unprocessed_questions={unprocessed_questions}, unprocessed_answers={unprocessed_answers}")
+        print("###### End ######")
 
     def is_above_threshold(self, a_attribs):
         """
@@ -77,29 +100,29 @@
     def add_answer(self, a_attribs):
         """
         Adds answer to its parent question in self.questions if it's either an accepted answer or above self.min_score.
-        If answer is an accepted answer, it gets appended to the AcceptedAnswer field, otherwise it gets appended to
-        OtherAnswers.
+        If answer is an accepted answer, it gets appended to the AcceptedAnswer field, otherwise it gets appended to OtherAnswers.
 
-        Also increments the question's 'ParsedAnswers' field. When ParsedAnswers = AnswerCount, the question is deleted
-        from memory and saved to a text file.
+        Also increments the question's 'ParsedAnswers' field. When ParsedAnswers = AnswerCount, the question is deleted from memory and saved to a text file.
 
         :param a_attribs: Answer's attribute dict
         """
         assert is_answer(a_attribs), "Must be an answer to add to parent"
-        if a_attribs is not None and self.questions[a_attribs["ParentId"]] is not None:
+        if self.questions.get(a_attribs["ParentId"], None) is not None:
             if is_accepted_answer(a_attribs, self.questions[a_attribs["ParentId"]]):
                 self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer")
                 self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1
             elif self.is_above_threshold(a_attribs):
-                if a_attribs["Id"] is not None:
-                    parent = self.questions[a_attribs["ParentId"]]
-                    if parent is not None:
-                        self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer")
-                        self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1
-                else:
-                    self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1
-            else:
+                self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer")
+                self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1
+            else:
+                self.num_discarded_answers += 1
+                # print("Discarded answer with score {}".format(a_attribs["Score"]), self.num_discarded_answers)
+                self.questions[a_attribs["ParentId"]]["NonAnswers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer")
                 self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1
+        else:
+            parentid = a_attribs["ParentId"]
+            self.num_discarded_answers += 1
+            # print(f"ParentId {parentid} not found", self.num_discarded_answers)
 
     def check_complete(self, a_attribs):
         """
@@ -107,13 +130,27 @@
         removes from dict and prints to file.
         """
         keys_to_del = []
-        parent = self.questions[a_attribs["ParentId"]]
+        parent = self.questions.get(a_attribs["ParentId"], None)
         if a_attribs is not None and parent is not None:
-            if parent["AnswerCount"] is not None and parent["ParsedAnswers"] is not None:
+            if parent["AnswerCount"] and parent["ParsedAnswers"]:
                 if int(parent["ParsedAnswers"]) == int(parent['AnswerCount']):
                     keys_to_del.append(a_attribs["ParentId"])
-                    if parent["Answers"] is not None and len(parent["Answers"]) > 0:
-                        out_name = "{}_{}.txt".format(self.name, parent["Id"].zfill(10))
+                    ## Filter change: still use questions with no accepted answers
+                    # if parent["Answers"] is not None and len(parent["Answers"]) > 0:
+                    if 1:
+                        out_tags = tags_as_list(parent["Tags"])
+                        out_name = "{}_{}_{}.txt".format(self.name, parent["Id"].zfill(10), "_".join(out_tags))
+                        out_dict = dict(
+                            name=out_name,
+                            tags=out_tags,
+                            title=BeautifulSoup(parent["Title"], "html.parser").get_text(),
+                            question=BeautifulSoup(parent["Body"], "html.parser").get_text(),
+                            answers=[],
+                            answers_scores=[],
+                            non_answers=[],
+                            non_answers_scores=[]
+                        )
+                        # fmt: "Q:\n\n{question.title}\n\n{question.body}\n\nA:\n\n{answer.body\n\n for answer.sortby(score)}"
                         out_str = ""
                         out_str += 'Q:\n\n'
                         if parent["Title"] is not None:
@@ -122,15 +159,29 @@
                             out_str += '{}\n\n'.format(BeautifulSoup(parent["Body"], "html.parser").get_text())
                         if parent["Answers"] is not None:
                             key_score_dict = {}
-                            for k, a in parent["Answers"].items():
-                                key_score_dict[k] = int(a["Score"])
-                            key_score_dict = {k: v for k, v in sorted(key_score_dict.items(), key=lambda item: item[1], reverse=True)}
+                            for ans_id, ans_attrib in parent["Answers"].items():
+                                key_score_dict[ans_id] = int(ans_attrib["Score"])
+                            key_score_dict = OrderedDict((ans_id, score) for ans_id, score in sorted(key_score_dict.items(), key=lambda item: item[1], reverse=True))
                             count = 0
-                            for k in key_score_dict:
+                            for ans_id, score in key_score_dict.items():
                                 if count >= self.max_responses:
                                     break
-                                out_str += 'A:\n\n{}\n\n'.format(BeautifulSoup(parent["Answers"][k]["Body"], "html.parser").get_text())
+                                ans_text = BeautifulSoup(parent["Answers"][ans_id]["Body"], "html.parser").get_text()
+                                out_str += 'A:\n\n{}\n\n'.format(ans_text)
+                                out_dict['answers'].append(ans_text)
+                                out_dict['answers_scores'].append(score)
                                 count += 1
+
+                        if parent["NonAnswers"] is not None:
+                            nona_key_score_dict = {}
+                            for ans_id, ans_attrib in parent["NonAnswers"].items():
+                                nona_key_score_dict[ans_id] = int(ans_attrib["Score"])
+                            nona_key_score_dict = OrderedDict((ans_id, score) for ans_id, score in sorted(nona_key_score_dict.items(), key=lambda item: item[1], reverse=True))
+                            for ans_id, score in nona_key_score_dict.items():
+                                ans_text = BeautifulSoup(parent["NonAnswers"][ans_id]["Body"], "html.parser").get_text()
+                                out_dict['non_answers'].append(ans_text)
+                                out_dict['non_answers_scores'].append(score)
                         if self.out_format == "txt":
                             with open("{}/{}".format(self.out_folder, out_name), 'w') as f:
                                 try:
@@ -144,10 +195,29 @@
                                 self.ar.writestr(out_name, filter_newlines(handle_unicode_errors(out_str)))
                         elif self.out_format == "lm_dataformat":
                             try:
-                                self.ar.add_data(filter_newlines(out_str), meta={
-                                    'name': out_name})
+                                self.ar.add_data(
+                                    filter_newlines(out_str),
+                                    meta=out_dict
+                                )
                             except:
-                                self.ar.add_data(filter_newlines(handle_unicode_errors(out_str)), meta={
-                                    'name': out_name})
+                                self.ar.add_data(
+                                    filter_newlines(handle_unicode_errors(out_str)),
+                                    meta=out_dict
+                                )
+                        if self.sample and len(out_dict['answers']) >= 3:
+                            with open("{}/samples/sample_{}".format(self.out_folder, out_name), 'w') as f:
+                                try:
+                                    f.write(filter_newlines(out_str))
+                                except:
+                                    f.write(filter_newlines(handle_unicode_errors(out_str)))
+                            self.sample = False
+                        self.num_questions += 1
+                        self.num_answers += len(out_dict['answers'])
+                        if len(key_score_dict) - len(out_dict['answers']) > 0:
+                            self.num_discarded_answers += len(key_score_dict) - len(out_dict['answers'])  # cases where number of answers is > max responses
+                            print(f"Discarding {len(key_score_dict) - len(out_dict['answers'])} of {len(key_score_dict)} answers", self.num_discarded_answers)
+                    else:
+                        # discard questions with no accepted answers
+                        self.num_discarded_questions += 1
         for key in keys_to_del:
             self.questions.pop(key, None)
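For orientation, the pairer now emits one plain-text document per kept question in the Q:/A: layout sketched in the fmt comment, plus the out_dict meta; a rough sketch of one record (all values below are made up):

```python
# Illustrative shape of one emitted record (all values are made up).
doc = "Q:\n\nHow do I calibrate my extruder?\n\n<question body as text>\n\nA:\n\n<top-scoring kept answer>\n\n"
meta = {
    "name": "3dprinting.stackexchange_0000000042_calibration_extruder.txt",
    "tags": ["calibration", "extruder"],
    "title": "How do I calibrate my extruder?",
    "question": "<question body as text>",
    "answers": ["<top-scoring kept answer>"],
    "answers_scores": [12],
    "non_answers": ["<answer scoring below min_score>"],
    "non_answers_scores": [1],
}
```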
diff --git a/requirements.txt b/requirements.txt
index 9d3cf3e..3833202 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ lxml
 py7zr
 tqdm
 lm-dataformat
-jsonlines
\ No newline at end of file
+jsonlines
+requests
\ No newline at end of file
diff --git a/utils.py b/utils.py
index a6c3dc1..1587aae 100644
--- a/utils.py
+++ b/utils.py
@@ -44,6 +44,17 @@ def has_answers(elem_attribs):
             return True
     return False
 
+def match_tags_or(elem_attribs, chk_tags):
+    assert is_question(elem_attribs), "Must be a question to match tags"
+    if not len(chk_tags):
+        return True
+    if elem_attribs["Tags"] is not None:
+        elem_tags = tags_as_list(elem_attribs["Tags"])
+        for tag in chk_tags:
+            if tag in elem_tags:
+                return True
+    return False
+
 def trim_attribs(elem_attribs, attrib_type="question"):
     """deletes non-useful data from attribs dict for questions / answers, returns remaining"""
@@ -53,6 +64,9 @@ def trim_attribs(elem_attribs, attrib_type="question"):
         [elem_attribs.pop(x, None) for x in to_delete]
         elem_attribs["ParsedAnswers"] = 0
         elem_attribs["Answers"] = {}
+        elem_attribs["NonAnswers"] = {}
+        if 'AnswerCount' not in elem_attribs.keys():
+            elem_attribs['AnswerCount'] = -1
     elif attrib_type == "answer":
         to_keep = ['Id', 'Body', 'Score']
         new_dict = {}
@@ -61,3 +75,10 @@
         return new_dict
     else:
         raise Exception('Unrecognized attribute type - please specify either question or answer')
+
+def tags_as_list(tag_str):
+    tag_str = tag_str.strip(">").strip("<")
+    tag_str = tag_str.replace("-", "_")
+    tag_list = tag_str.split("><")
+    tag_list.sort()
+    return tag_list
\ No newline at end of file
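Finally, a quick sketch of what the new tags_as_list helper returns for a typical Tags attribute from Posts.xml (which, once XML entities are decoded, looks like "<tag1><tag2>"):

```python
# Sketch: tags_as_list strips the angle brackets, replaces '-' with '_', splits, and sorts.
from utils import tags_as_list

print(tags_as_list("<machine-learning><python>"))
# ['machine_learning', 'python']
```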