Skip to content

Commit 53d83e5

Browse files
authored
Merge pull request #55 from Ajk4/multiple_input_files
Allow list of input files
2 parents 1481091 + 1657291 commit 53d83e5

File tree

4 files changed

+43
-19
lines changed

4 files changed

+43
-19
lines changed

src/configuration.rs

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,8 +36,8 @@ pub struct Configuration {
3636
/// RAM we can support training with mmap files
3737
pub in_memory_embedding_calculation: bool,
3838

39-
/// Path to the input file
40-
pub input: String,
39+
/// Paths to the input files
40+
pub input: Vec<String>,
4141

4242
/// Type of the input file
4343
pub file_type: FileType,
@@ -86,7 +86,7 @@ impl Configuration {
8686
log_every_n: 1000,
8787
in_memory_embedding_calculation: true,
8888
file_type: FileType::Tsv,
89-
input,
89+
input: vec![input],
9090
output_dir: None,
9191
output_format: OutputFormat::TextFile,
9292
relation_name: String::from("emb"),

src/main.rs

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,17 @@ fn main() {
2525
.version(crate_version!())
2626
.author(crate_authors!())
2727
.about(crate_description!())
28+
.arg(
29+
Arg::new("inputs")
30+
.multiple_values(true)
31+
.help("Input files paths")
32+
.takes_value(true),
33+
)
2834
.arg(
2935
Arg::new("input")
3036
.short('i')
3137
.long("input")
32-
.required(true)
33-
.help("Input file path")
38+
.help("Deprecated. Use positional args for input files")
3439
.takes_value(true),
3540
)
3641
.arg(
@@ -127,7 +132,22 @@ fn main() {
127132

128133
info!("Reading args...");
129134

130-
let input = matches.value_of("input").unwrap();
135+
let input: Vec<String> = {
136+
let named_arg = matches.value_of("input");
137+
let position_args = match matches.values_of("inputs") {
138+
None => vec![],
139+
Some(values) => values.into_iter().collect(),
140+
};
141+
position_args
142+
.into_iter()
143+
.chain(named_arg.into_iter())
144+
.map(|s| s.to_string())
145+
.collect()
146+
};
147+
if input.is_empty() {
148+
panic!("Missing input files")
149+
}
150+
131151
let file_type = match matches.value_of("file-type") {
132152
Some(type_name) => match type_name {
133153
"tsv" => configuration::FileType::Tsv,
@@ -192,7 +212,7 @@ fn main() {
192212
prepend_field: prepend_field_name,
193213
log_every_n: log_every,
194214
in_memory_embedding_calculation,
195-
input: input.to_string(),
215+
input,
196216
file_type,
197217
output_dir,
198218
output_format,

src/pipeline.rs

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,20 @@ pub fn build_graphs(
5050
match &config.file_type {
5151
FileType::Json => {
5252
let mut parser = dom::Parser::default();
53-
read_file(config, |line| {
54-
let row = parse_json_line(line, &mut parser, &config.columns);
55-
entity_processor.process_row(&row);
56-
});
53+
for input in config.input.iter() {
54+
read_file(input, config.log_every_n as u64, |line| {
55+
let row = parse_json_line(line, &mut parser, &config.columns);
56+
entity_processor.process_row(&row);
57+
});
58+
}
5759
}
5860
FileType::Tsv => {
59-
read_file(config, |line| {
60-
let row = parse_tsv_line(line);
61-
entity_processor.process_row(&row);
62-
});
61+
for input in config.input.iter() {
62+
read_file(input, config.log_every_n as u64, |line| {
63+
let row = parse_tsv_line(line);
64+
entity_processor.process_row(&row);
65+
});
66+
}
6367
}
6468
}
6569

@@ -77,11 +81,11 @@ pub fn build_graphs(
7781
}
7882

7983
/// Read file line by line. Pass every valid line to handler for parsing.
80-
fn read_file<F>(config: &Configuration, mut line_handler: F)
84+
fn read_file<F>(filepath: &str, log_every: u64, mut line_handler: F)
8185
where
8286
F: FnMut(&str),
8387
{
84-
let input_file = File::open(&config.input).expect("Can't open file");
88+
let input_file = File::open(filepath).expect("Can't open file");
8589
let mut buffered = BufReader::new(input_file);
8690

8791
let mut line_number = 1u64;
@@ -104,7 +108,7 @@ where
104108
// clear to reuse the buffer
105109
line.clear();
106110

107-
if line_number % config.log_every_n as u64 == 0 {
111+
if line_number % log_every == 0 {
108112
info!("Number of lines processed: {}", line_number);
109113
}
110114

tests/snapshot.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ fn prepare_config() -> Configuration {
8787
prepend_field: false,
8888
log_every_n: 10000,
8989
in_memory_embedding_calculation: true,
90-
input: "files/samples/edgelist_1.tsv".to_string(),
90+
input: vec!["files/samples/edgelist_1.tsv".to_string()],
9191
file_type: FileType::Tsv,
9292
output_format: OutputFormat::TextFile,
9393
output_dir: None,

0 commit comments

Comments
 (0)