convert.py
import json, os, argparse
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer
from datasets import load_dataset
from multiprocessing import Pool, cpu_count

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Params')
    parser.add_argument('--segment_len', type=int, default=254,
                        help='the length of each example')
    # we set this to 254 instead of 256 because we want the input to look like: <control_code> input_ids <eos>
    parser.add_argument('--stride', type=int, default=10,
                        help='stride used to split training examples')
    parser.add_argument('--dev_size', type=float, default=0.1,
                        help='split ratio of the development set for each language')
    args = parser.parse_args()

    gpt2_tok = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False)
    segments = {}
    dataset = load_dataset("ArtifactAI/arxiv_python_research_code", split='train[:50%]')
    def process_batch(batch):
        # gpt2_tok and args defined above are inherited by the forked worker
        # processes, so every worker reuses the same tokenizer configuration
        # (this relies on the default 'fork' start method on Linux).
        results = []
        for example in batch:
            code_content = example
            encoded = gpt2_tok.encode(code_content, max_length=1024, truncation=True)
            # slide a window of segment_len tokens over the encoded file with the given stride
            for i in range(0, len(encoded), args.stride):
                seg = encoded[i:i + args.segment_len]
                results.append(json.dumps({"token_ids": seg, "label": "Python"}))
        return results
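    # Worked example of the windowing above (assuming the default flags): with
    # stride=10 and segment_len=254, segments start at tokens 0, 10, 20, ... and
    # neighbouring segments overlap in up to 244 tokens. Each element of the
    # returned list is a JSON string such as
    # {"token_ids": [50256, 1330, ...], "label": "Python"}
    # (the token ids shown here are illustrative).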
    num_processes = min(48, cpu_count())
    # derive the batch size from the dataset size; adjust for available memory
    batch_size = len(dataset) // (10 * num_processes)
    # batch_size = 1000  # alternative: a fixed size tuned to memory capacity and CPU power
    batches = [dataset[i:i + batch_size]['code'] for i in range(0, len(dataset), batch_size)]

    # process the batches in parallel
    with Pool(processes=num_processes) as pool:
        results = pool.map(process_batch, batches)
    segments = {"Python": [item for sublist in results for item in sublist]}
    train, dev = [], []
    for key in segments:
        # split without shuffling so that neighbouring (heavily overlapping) segments
        # stay on the same side of the split and train/dev overlap as little as possible
        tr, de = train_test_split(segments[key], test_size=args.dev_size, shuffle=False)
        train += tr
        dev += de

    to_path = "dataset/source_code/json"
    if not os.path.isdir(to_path):
        os.makedirs(to_path)
    with open(os.path.join(to_path, "train.jsonl"), "w") as f:
        f.write("\n".join(train))
    with open(os.path.join(to_path, "dev.jsonl"), "w") as f:
        f.write("\n".join(dev))
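
# A minimal sketch of how this script might be invoked (the flag values shown
# are simply the defaults defined above):
#
#   python convert.py --segment_len 254 --stride 10 --dev_size 0.1
#
# On completion it writes dataset/source_code/json/train.jsonl and
# dataset/source_code/json/dev.jsonl, one JSON-encoded segment per line.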