Skip to content

Commit

Permalink
Moved windows glob() over to dev/unistd.h
Browse files Browse the repository at this point in the history
Added header guard and changed long->int64_t in dataloader.h
  • Loading branch information
rosslwheeler committed May 24, 2024
1 parent 1ec081e commit e5083be
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 93 deletions.
106 changes: 14 additions & 92 deletions dataloader.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,88 +15,10 @@ Implements a medium simple DataLoader for a distributed training setup.
#include "utils.h"

// ----------------------------------------------------------------------------
// implementation of glob for Windows
// implementation of glob for Windows is in dev/unistd.h
#ifndef _WIN32
#include <glob.h>
#else

typedef struct glob_t {
size_t gl_pathc; // Count of matched pathnames
char **gl_pathv; // List of matched pathnames
} glob_t;

void replace_forward_slashes(char* str) {
while (*str) {
if (*str == '/') {
*str = '\\';
}
str++;
}
}

void globfree(glob_t *pglob) {
for (size_t i = 0; i < pglob->gl_pathc; ++i) {
free(pglob->gl_pathv[i]); // Free the allocated memory for each filename
}
free(pglob->gl_pathv); // Free the allocated memory for the list of filenames
}

int glob(const char* pattern, int ignored_flags, int (*ignored_errfunc)(const char* epath, int eerrno), glob_t* pglob){
struct _finddata_t find_file_data;
char full_path[576]; // stored in pglob->gl_pathv[n]
char directory_path[512] = {0}; // Store the directory path from the pattern
char pattern_copy[512]; // Copy of the pattern to modify

strncpy_s(pattern_copy, sizeof(pattern_copy) - 1, pattern, sizeof(pattern_copy) - 1);

replace_forward_slashes (pattern_copy); // Replace forward slashes with backslashes

if (strchr(pattern_copy, '\\') != NULL) {
strncpy_s(directory_path, sizeof(directory_path) - 1, pattern_copy, strrchr(pattern_copy, '\\') - pattern_copy + 1);
directory_path[strrchr(pattern_copy, '\\') - pattern_copy + 1] = '\0';
}

// find the first file matching the pattern in the directory
intptr_t find_handle = _findfirst(pattern_copy, &find_file_data);

if (find_handle == -1) {
return 1; // No files found
}

size_t file_count = 0;
size_t max_files = 64000; // hard-coded limit for the number of files

pglob->gl_pathv = (char **) malloc(max_files * sizeof(char*)); // freed in globfree

if (pglob->gl_pathv == NULL) {
_findclose(find_handle);
return 2; // Memory allocation failed
}

do {
if (file_count >= max_files) {
_findclose(find_handle);
return 2; // Too many files found
}

snprintf(full_path, sizeof(full_path), "%s%s", directory_path, find_file_data.name);

pglob->gl_pathv[file_count] = _strdup(full_path); // freed in globfree

if (pglob->gl_pathv[file_count] == NULL) {
_findclose(find_handle);
return 2; // Memory allocation for filename failed
}
file_count++;
} while (_findnext(find_handle, &find_file_data) == 0);

_findclose(find_handle);

pglob->gl_pathc = file_count;
return 0;
}
#endif

// ----------------------------------------------------------------------------
// Distributed Data Loader
#define HEADER_SIZE 256
Expand All @@ -113,16 +35,16 @@ typedef struct {
glob_t glob_result; // stores the result of glob, for all shards we want to iterate
int current_shard; // the current shard we are reading from
FILE* tokens_file;
long file_size;
long current_position;
int64_t file_size;
int64_t current_position;
uint16_t* buffer; // we fread data from file into this buffer
// public variables that could be accessed from outside
size_t num_batches;
int* inputs; // input tokens into transformer
int* targets; // target tokens for the transformer
} DataLoader;

long dataloader_load_shard_(DataLoader *loader, int shard_index) {
int64_t dataloader_load_shard_(DataLoader *loader, int shard_index) {
// use the first glob match as the filename for now
const char* filename = loader->glob_result.gl_pathv[shard_index];
// open the input file for reading. also only a single file can be opened at a time
Expand All @@ -140,14 +62,14 @@ long dataloader_load_shard_(DataLoader *loader, int shard_index) {
exit(EXIT_FAILURE);
}
if (header[1] != 1) { printf("Bad version in data file\n"); exit(EXIT_FAILURE); }
long ntok = header[2]; // number of tokens in the file
int64_t ntok = header[2]; // number of tokens in the file
assert(ntok > 0); // we expect some tokens in the file. this should never trip, right?
// determine the file size and make sure it is consistent with the number of tokens
fseekCheck(loader->tokens_file, 0, SEEK_END); // seek to end of file
loader->file_size = ftell(loader->tokens_file); // read the offset, i.e. file size
fseekCheck(loader->tokens_file, 0, SEEK_SET); // seek back to the beginning
// we expect ntok in the file to be consistent with filesize, assert that is the case
long expected_file_size = HEADER_SIZE * sizeof(int) + ntok * sizeof(uint16_t);
int64_t expected_file_size = HEADER_SIZE * sizeof(int) + ntok * sizeof(uint16_t);
if (loader->file_size != expected_file_size) {
printf("Error: file size is not as expected\n");
exit(EXIT_FAILURE);
Expand All @@ -158,8 +80,8 @@ long dataloader_load_shard_(DataLoader *loader, int shard_index) {
void dataloader_reset(DataLoader *loader) {
// fully resets the DataLoader object to init configuration
// each process starts at a different offset in the file
long header_bytes = HEADER_SIZE * sizeof(int);
long token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t);
int64_t header_bytes = HEADER_SIZE * sizeof(int);
int64_t token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t);
loader->current_shard = 0;
loader->current_position = header_bytes + token_bytes_offset;
dataloader_load_shard_(loader, loader->current_shard);
Expand All @@ -172,8 +94,8 @@ void dataloader_advance_(DataLoader *loader) {
loader->current_shard = (loader->current_shard + 1) % loader->glob_result.gl_pathc;
dataloader_load_shard_(loader, loader->current_shard);
}
long header_bytes = HEADER_SIZE * sizeof(int);
long token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t);
int64_t header_bytes = HEADER_SIZE * sizeof(int);
int64_t token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t);
loader->current_position = header_bytes + token_bytes_offset;
}

Expand Down Expand Up @@ -202,9 +124,9 @@ void dataloader_init(DataLoader *loader,

// inspect and validate all shards so we don't get any runtime errors later
// if too slow / too many shards, may wish to revisit later
long ntok_total = 0;
int64_t ntok_total = 0;
for (int shard_index = 0; shard_index < loader->glob_result.gl_pathc; shard_index++) {
long shard_ntok = dataloader_load_shard_(loader, shard_index);
int64_t shard_ntok = dataloader_load_shard_(loader, shard_index);
// we need at least one batch/shard, the way things are written right now.
// can be relaxed a lot later.
assert(shard_ntok >= num_processes * B * T + 1);
Expand Down Expand Up @@ -286,7 +208,7 @@ typedef struct {
size_t T; // maximum context length of the model
// input handling and its state
FILE* eval_file;
long file_size;
int64_t file_size;
uint16_t* buffer; // we fread data from file into this buffer
// public variables that could be accessed from outside
int num_examples; // in total across all processes
Expand Down Expand Up @@ -318,7 +240,7 @@ void evalloader_reset(EvalLoader *loader) {
}
// now seek through the file to the start of that example
// utilize <EXAMPLE_BYTES> for efficiency
long header_bytes = HEADER_SIZE * sizeof(int);
int64_t header_bytes = HEADER_SIZE * sizeof(int);
fseekCheck(loader->eval_file, header_bytes, SEEK_SET);
for (int i = 0; i < loader->start_example_index; i++) {
uint16_t example_header[3];
Expand Down
82 changes: 81 additions & 1 deletion dev/unistd.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
#define _CRT_SECURE_NO_WARNINGS
#define _USE_MATH_DEFINES

#include <stdio.h>

#include <math.h>
//#define gen_max_length 64 // compile as C++ to skip this VLA issue
#include <time.h>

#define CLOCK_MONOTONIC 0
int clock_gettime(int ignore_variable, struct timespec* tv)
static inline int clock_gettime(int ignore_variable, struct timespec* tv)
{
return timespec_get(tv, TIME_UTC); // TODO: not sure this is the best solution. Need to review.
}
Expand All @@ -23,4 +25,82 @@ int clock_gettime(int ignore_variable, struct timespec* tv)
#define TURN_OFF_FP_FAST __pragma(float_control( precise, on, push )) // Save current setting and turn on /fp:precise
#define TURN_ON_FP_FAST __pragma(float_control(pop)) // Restore file's default settings

#define _mkdir mkdir // add mkdir into namespace for windows

typedef struct glob_t {
size_t gl_pathc; // Count of matched pathnames
char **gl_pathv; // List of matched pathnames
} glob_t;

static inline void replace_forward_slashes(char* str) {
while (*str) {
if (*str == '/') {
*str = '\\';
}
str++;
}
}

static inline void globfree(glob_t *pglob) {
for (size_t i = 0; i < pglob->gl_pathc; ++i) {
free(pglob->gl_pathv[i]); // Free the allocated memory for each filename
}
free(pglob->gl_pathv); // Free the allocated memory for the list of filenames
}

static inline int glob(const char* pattern, int ignored_flags, int (*ignored_errfunc)(const char* epath, int eerrno), glob_t* pglob){
struct _finddata_t find_file_data;
char full_path[576]; // stored in pglob->gl_pathv[n]
char directory_path[512] = {0}; // Store the directory path from the pattern
char pattern_copy[512]; // Copy of the pattern to modify

strncpy_s(pattern_copy, sizeof(pattern_copy) - 1, pattern, sizeof(pattern_copy) - 1);

replace_forward_slashes (pattern_copy); // Replace forward slashes with backslashes

if (strchr(pattern_copy, '\\') != NULL) {
strncpy_s(directory_path, sizeof(directory_path) - 1, pattern_copy, strrchr(pattern_copy, '\\') - pattern_copy + 1);
directory_path[strrchr(pattern_copy, '\\') - pattern_copy + 1] = '\0';
}

// find the first file matching the pattern in the directory
intptr_t find_handle = _findfirst(pattern_copy, &find_file_data);

if (find_handle == -1) {
return 1; // No files found
}

size_t file_count = 0;
size_t max_files = 64000; // hard-coded limit for the number of files

pglob->gl_pathv = (char **) malloc(max_files * sizeof(char*)); // freed in globfree

if (pglob->gl_pathv == NULL) {
_findclose(find_handle);
return 2; // Memory allocation failed
}

do {
if (file_count >= max_files) {
_findclose(find_handle);
return 2; // Too many files found
}

snprintf(full_path, sizeof(full_path), "%s%s", directory_path, find_file_data.name);

pglob->gl_pathv[file_count] = _strdup(full_path); // freed in globfree

if (pglob->gl_pathv[file_count] == NULL) {
_findclose(find_handle);
return 2; // Memory allocation for filename failed
}
file_count++;
} while (_findnext(find_handle, &find_file_data) == 0);

_findclose(find_handle);

pglob->gl_pathc = file_count;
return 0;
}

#endif

0 comments on commit e5083be

Please sign in to comment.