-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmpi_split_corpus.cpp
134 lines (121 loc) · 3.94 KB
/
mpi_split_corpus.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include <stdint.h>
#include <stdlib.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
// Scans the corpus `file` once and reports, through the reference
// parameters, the summed length of the leading field of every token line
// (total_len), the number of lines read (line_num) and the number of
// sentence-separator lines seen (sent_num). Returns false when the file
// cannot be opened.
bool GetCorpusInfo(const char *file, uint32_t &total_len,
uint32_t &line_num, uint32_t &sent_num);
// Distributes the sentences of the corpus `file` over `part_num` output files
// named "<file>_<index>", using `avg_len_per_part` as the target size of a
// partition and `avg_len_per_sent` as the allowed overshoot, and additionally
// creates an empty file "<file>_<part_num>". Returns false on failure (for
// example when the corpus cannot be opened).
bool SplitCorpus(const char *file, int part_num,
uint32_t avg_len_per_part, uint32_t avg_len_per_sent);
/**
 * Command-line entry point.
 *
 * Usage: <program> partition_number corpus_file
 *
 * Gathers the corpus statistics with GetCorpusInfo(), derives the average
 * partition length and the average sentence length from them, and hands both
 * to SplitCorpus().
 *
 * @return 0 on success, -1 on a usage error, an invalid partition number, or
 *         when reading or splitting the corpus fails.
 */
int main(int argc, char *argv[]) {
  if (3 != argc) {
    std::cerr << "usage: " << argv[0] << " partition_number corpus_file\n";
    return -1;
  }

  // Validate the partition count before touching the corpus: it is used as a
  // divisor below, so zero, negative, out-of-range or non-numeric input must
  // be rejected instead of being passed through atoi() unchecked.
  std::istringstream arg(argv[1]);
  int p_num = 0;
  char trailing = '\0';
  if (!(arg >> p_num) || (arg >> trailing) || p_num <= 0) {
    std::cerr << "Invalid partition number [" << argv[1] << "]!\n";
    return -1;
  }

  uint32_t total_len(0), line_num(0), sent_num(0);
  if (!GetCorpusInfo(argv[2], total_len, line_num, sent_num)) {
    std::cerr << "Get corpus [" << argv[2] << "] info failed!\n";
    return -1;
  }

  const uint32_t avg_len_per_part = total_len / static_cast<uint32_t>(p_num);
  // A corpus without any counted sentence has no meaningful average; use 0
  // (no overshoot tolerance) rather than dividing by zero.
  const uint32_t avg_len_per_sent = (0 != sent_num) ? total_len / sent_num : 0;

  if (!SplitCorpus(argv[2], p_num, avg_len_per_part, avg_len_per_sent)) {
    std::cerr << "Split corpus [" << argv[2] << "] failed!\n";
    return -1;
  }
  return 0;
}
/**
 * Scans a corpus file once and gathers the statistics used to size partitions.
 *
 * The file is read line by line. A line that is empty, or whose first
 * character is NUL, a space or a tab, is a sentence separator. Every other
 * line is a token line; its length is the number of characters before its
 * first tab or space. A token line containing neither contributes 0, the same
 * convention SplitCorpus() applies when it measures a sentence.
 *
 * @param file       Path of the corpus file.
 * @param total_len  Out: sum of the token-line lengths.
 * @param line_num   Out: number of lines read, separators included.
 * @param sent_num   Out: number of separator lines, plus one if the file ends
 *                   with token lines that are not followed by a separator.
 * @return false if the file cannot be opened (all outputs are 0 in that
 *         case), true otherwise.
 */
bool GetCorpusInfo(const char *file, uint32_t &total_len,
                   uint32_t &line_num, uint32_t &sent_num) {
  total_len = line_num = sent_num = 0;
  std::ifstream ifs(file);
  if (!ifs.is_open()) {
    std::cerr << "Open file [" << file << "] failed!\n";
    return false;
  }

  bool open_sentence = false;  // token lines seen since the last separator
  std::string line;
  while (std::getline(ifs, line)) {
    ++line_num;
    if (line.empty() || line[0] == '\0' || line[0] == ' ' || line[0] == '\t') {
      ++sent_num;
      open_sentence = false;
      continue;
    }
    open_sentence = true;
    const size_t pos = line.find_first_of("\t ");
    // npos must not be added: it would wrap the 32-bit counter.
    if (std::string::npos != pos) {
      total_len += static_cast<uint32_t>(pos);
    }
  }
  // A final sentence without a trailing separator line is still a sentence.
  if (open_sentence) {
    ++sent_num;
  }
  return true;
}
/**
 * Splits a corpus into `part_num` consecutive partitions of similar size.
 *
 * The corpus is read line by line and grouped into sentences: a sentence is
 * a run of lines ending with a separator line (empty, or starting with NUL,
 * a space or a tab). Lines left over at end of file without a separator form
 * a final sentence. The length of a sentence is the sum, over its lines, of
 * the number of characters before the first tab or space (0 for a line that
 * has neither).
 *
 * Sentences are written, in input order, to "<file>_0" .. "<file>_<part_num-1>".
 * The current partition keeps receiving sentences while its accumulated
 * length is below `avg_len_per_part` and adding the sentence does not exceed
 * `avg_len_per_part + avg_len_per_sent`; otherwise the sentence starts the
 * next partition. The last partition absorbs everything that remains.
 * Finally an empty file "<file>_<part_num>" is created.
 *
 * @param file              Path of the corpus file.
 * @param part_num          Number of partitions; must be positive.
 * @param avg_len_per_part  Target length of one partition.
 * @param avg_len_per_sent  Overshoot allowed on top of the target.
 * @return false if `part_num` is not positive, if the corpus or a partition
 *         file cannot be opened, or if writing a partition failed; true
 *         otherwise.
 */
bool SplitCorpus(const char *file, int part_num,
                 uint32_t avg_len_per_part, uint32_t avg_len_per_sent) {
  if (part_num <= 0) {
    std::cerr << "Invalid partition number [" << part_num << "]!\n";
    return false;
  }

  // Open the input before creating any output so that a bad path neither
  // leaves empty partition files behind nor leaks the output streams.
  std::ifstream ifs(file);
  if (!ifs.is_open()) {
    std::cerr << "Open file [" << file << "] failed!\n";
    return false;
  }

  // One output stream per partition, named "<file>_<index>". The vector is
  // value-initialised, so slots that were never opened hold null pointers.
  std::ostringstream oss;
  std::vector<std::ofstream *> ofs(part_num);
  bool outputs_ok = true;
  for (int i = 0; i < part_num && outputs_ok; ++i) {
    oss.str("");
    oss << file << "_" << i;
    ofs[i] = new std::ofstream(oss.str().c_str());
    if (!ofs[i]->is_open()) {
      std::cerr << "Open file [" << oss.str() << "] failed!\n";
      outputs_ok = false;
    }
  }
  if (!outputs_ok) {
    for (int i = 0; i < part_num; ++i) {
      delete ofs[i];  // deleting a null pointer is a no-op
    }
    return false;
  }

  // 64-bit arithmetic keeps the budget comparison free of wrap-around.
  const uint64_t part_budget = avg_len_per_part;
  const uint64_t soft_budget = part_budget + avg_len_per_sent;

  int pid = 0;             // partition currently being filled
  uint64_t total_len = 0;  // length accumulated in partition `pid`
  std::string line;
  std::vector<std::string> sent_info;
  bool at_eof = false;
  while (!at_eof) {
    if (std::getline(ifs, line)) {
      sent_info.push_back(line);
      const bool separator = line.empty() || line[0] == '\0' ||
                             line[0] == ' ' || line[0] == '\t';
      if (!separator) continue;
    } else {
      // End of input: flush a trailing sentence that has no separator line
      // instead of dropping it.
      at_eof = true;
      if (sent_info.empty()) break;
    }

    uint64_t sent_len = 0;
    for (size_t i = 0; i < sent_info.size(); ++i) {
      const size_t pos = sent_info[i].find_first_of("\t ");
      if (std::string::npos != pos) sent_len += pos;
    }

    const bool fits =
        total_len < part_budget && total_len + sent_len <= soft_budget;
    if (fits) {
      total_len += sent_len;
    } else {
      // Start the next partition; the last one takes all that is left.
      if (pid + 1 < part_num) ++pid;
      total_len = sent_len;
    }

    std::ofstream &out = *ofs[pid];
    for (size_t i = 0; i < sent_info.size(); ++i) {
      out << sent_info[i] << '\n';
    }
    sent_info.clear();
  }
  ifs.close();

  bool write_ok = true;
  for (int i = 0; i < part_num; ++i) {
    ofs[i]->close();
    if (ofs[i]->fail()) {
      std::cerr << "Write file [" << file << "_" << i << "] failed!\n";
      write_ok = false;
    }
    delete ofs[i];
  }

  // generate the blank part
  oss.str("");
  oss << file << "_" << part_num;
  std::ofstream bofs(oss.str().c_str());
  bofs.close();
  return write_ok;
}