From 9235609c2e56b960d9a3f6bb3eacfd54f10b6191 Mon Sep 17 00:00:00 2001 From: Lea Silakov Date: Sun, 1 Mar 2026 19:24:03 +0300 Subject: [PATCH 1/5] naive version (94s) --- app/Parser.php | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/app/Parser.php b/app/Parser.php index b74cf7b9f7..dc48b5cd69 100644 --- a/app/Parser.php +++ b/app/Parser.php @@ -8,6 +8,39 @@ final class Parser { public function parse(string $inputPath, string $outputPath): void { - throw new Exception('TODO'); + $out = []; + + $handle = fopen($inputPath, 'r'); + if ($handle === false) { + throw new Exception("Failed to open input file: {$inputPath}"); + } + + try { + while (($line = fgets($handle)) !== false) { + $line = rtrim($line, "\r\n"); + if ($line !== '') { + [$path, $date] = $this->parseLine($line); + if (!isset($out[$path])) $out[$path] = []; + $out[$path][$date] = ($out[$path][$date] ?? 0) + 1; + } + } + } finally { + fclose($handle); + } + + foreach ($out as $path => &$dates) { + ksort($dates); + } + + file_put_contents($outputPath, json_encode($out, JSON_PRETTY_PRINT)); + } + + private function parseLine(string $line): array + { + // skipping https://stitcher.io/, , it's all the same domain anyway + // also skipping 15ch of datetime + $line = substr($line, 19, -15); + + return explode(',', $line); } } \ No newline at end of file From 59492e985c7cd33d1dbb903b06d8839602573e13 Mon Sep 17 00:00:00 2001 From: Lea Silakov Date: Sun, 1 Mar 2026 19:58:34 +0300 Subject: [PATCH 2/5] better date processing and custom output (73s) --- app/Parser.php | 135 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 121 insertions(+), 14 deletions(-) diff --git a/app/Parser.php b/app/Parser.php index dc48b5cd69..d2d9751209 100644 --- a/app/Parser.php +++ b/app/Parser.php @@ -6,41 +6,148 @@ final class Parser { + private const int READ_CHUNK = 163_840; + private const int WRITE_BUF = 1_048_576; + public function parse(string $inputPath, string $outputPath): void { + gc_disable(); + + $fileSize = filesize($inputPath); + + $dateIds = []; + $dates = []; + $dateCount = 0; + for ($y = 21; $y <= 26; $y++) { + for ($m = 1; $m <= 12; $m++) { + $maxD = match ($m) { + 2 => $y === 24 ? 29 : 28, + 4, 6, 9, 11 => 30, + default => 31, + }; + $mStr = ($m < 10 ? '0' : '') . $m; + $ymStr = "{$y}-{$mStr}-"; + for ($d = 1; $d <= $maxD; $d++) { + $key = $ymStr . (($d < 10 ? '0' : '') . $d); + $dateIds[$key] = $dateCount; + $dates[$dateCount] = $key; + $dateCount++; + } + } + } + $out = []; - $handle = fopen($inputPath, 'r'); + $handle = fopen($inputPath, 'rb'); if ($handle === false) { throw new Exception("Failed to open input file: {$inputPath}"); } + + stream_set_read_buffer($handle, 0); try { - while (($line = fgets($handle)) !== false) { - $line = rtrim($line, "\r\n"); - if ($line !== '') { - [$path, $date] = $this->parseLine($line); - if (!isset($out[$path])) $out[$path] = []; - $out[$path][$date] = ($out[$path][$date] ?? 0) + 1; + $remaining = $fileSize; + + while ($remaining > 0) { + $toRead = $remaining > self::READ_CHUNK ? self::READ_CHUNK : $remaining; + $chunk = fread($handle, $toRead); + $chunkLen = strlen($chunk); + + if ($chunkLen === 0) break; + $remaining -= $chunkLen; + + $lastNl = strrpos($chunk, "\n"); + if ($lastNl === false) { + $this->processChunk($chunk, $dateIds, $out); + continue; + } + + $this->processChunk(substr($chunk, 0, $lastNl + 1), $dateIds, $out); + + $tail = $chunkLen - $lastNl - 1; + if ($tail > 0) { + fseek($handle, -$tail, SEEK_CUR); + $remaining += $tail; } } } finally { fclose($handle); } - foreach ($out as $path => &$dates) { - ksort($dates); + foreach ($out as $path => &$dateIdCounts) { + ksort($dateIdCounts); } + unset($dateIdCounts); - file_put_contents($outputPath, json_encode($out, JSON_PRETTY_PRINT)); + $this->jsonize($outputPath, $out, $dates); } - - private function parseLine(string $line): array + + private function processChunk(string $chunk, array $dateIds, array &$out): void { - // skipping https://stitcher.io/, , it's all the same domain anyway + $pos = 0; + $chunkLen = strlen($chunk); + + while ($pos < $chunkLen) { + $nlPos = strpos($chunk, "\n", $pos); + if ($nlPos === false) { + $line = substr($chunk, $pos); + if ($line !== '') { + $this->processLine($line, $dateIds, $out); + } + break; + } + + $line = substr($chunk, $pos, $nlPos - $pos); + if ($line !== '') { + $this->processLine($line, $dateIds, $out); + } + $pos = $nlPos + 1; + } + } + + private function processLine(string $line, array $dateIds, array &$out): void + { + // skipping https://stitcher.io/, it's all the same domain anyway // also skipping 15ch of datetime $line = substr($line, 19, -15); + + $commaPos = strpos($line, ','); + if ($commaPos === false) return; + + $path = substr($line, 0, $commaPos); + $dateFull = substr($line, $commaPos + 1, 10); + + $date = substr($dateFull, 2); + + $dateKey = $dateIds[$date] ?? $dateFull; + + if (!isset($out[$path])) { + $out[$path] = []; + } + $out[$path][$dateKey] = ($out[$path][$dateKey] ?? 0) + 1; + } + + private function jsonize($filename, &$out, array $dates) { + $file = fopen($filename, 'wb'); + stream_set_write_buffer($file, self::WRITE_BUF); + fwrite($file, '{'); + + $isFirst = true; + foreach ($out as $k => $ds) { + $buf = $isFirst ? "\n \"" : ",\n \""; + $buf .= str_replace('/', '\\/', $k)."\": {\n"; + $firstDate = true; + foreach ($ds as $dateId => $v) { + $dateStr = is_int($dateId) ? '20' . $dates[$dateId] : $dateId; + $buf .= ($firstDate ? '' : ",\n") . " \"$dateStr\": $v"; + $firstDate = false; + } + $buf .= "\n }"; + fwrite($file, $buf); + $isFirst = false; + } - return explode(',', $line); + fwrite($file, "\n}"); + fclose($file); } } \ No newline at end of file From 5f6a4aeba8c3b832522bc242998b4e4521904dac Mon Sep 17 00:00:00 2001 From: Lea Silakov Date: Fri, 13 Mar 2026 03:35:35 +0300 Subject: [PATCH 3/5] cleaned up the file read logic and global namespace for funcs (31.45s) --- app/Parser.php | 150 +++++++++++++++++-------------------------------- 1 file changed, 52 insertions(+), 98 deletions(-) diff --git a/app/Parser.php b/app/Parser.php index d2d9751209..5c2e6a040c 100644 --- a/app/Parser.php +++ b/app/Parser.php @@ -2,19 +2,18 @@ namespace App; -use Exception; - final class Parser { - private const int READ_CHUNK = 163_840; - private const int WRITE_BUF = 1_048_576; + private const READ_CHUNK = 163_840; + private const PREFIX_LEN = 25; // "https://stitcher.io/blog/" + private const WRITE_BUF = 1_048_576; public function parse(string $inputPath, string $outputPath): void { - gc_disable(); + \gc_disable(); + + $fileSize = \filesize($inputPath); - $fileSize = filesize($inputPath); - $dateIds = []; $dates = []; $dateCount = 0; @@ -38,116 +37,71 @@ public function parse(string $inputPath, string $outputPath): void $out = []; - $handle = fopen($inputPath, 'rb'); - if ($handle === false) { - throw new Exception("Failed to open input file: {$inputPath}"); - } - - stream_set_read_buffer($handle, 0); - - try { - $remaining = $fileSize; - - while ($remaining > 0) { - $toRead = $remaining > self::READ_CHUNK ? self::READ_CHUNK : $remaining; - $chunk = fread($handle, $toRead); - $chunkLen = strlen($chunk); - - if ($chunkLen === 0) break; - $remaining -= $chunkLen; - - $lastNl = strrpos($chunk, "\n"); - if ($lastNl === false) { - $this->processChunk($chunk, $dateIds, $out); - continue; - } - - $this->processChunk(substr($chunk, 0, $lastNl + 1), $dateIds, $out); - - $tail = $chunkLen - $lastNl - 1; - if ($tail > 0) { - fseek($handle, -$tail, SEEK_CUR); - $remaining += $tail; - } + $handle = \fopen($inputPath, 'rb'); + \stream_set_read_buffer($handle, 0); + $remaining = $fileSize; + + while ($remaining > 0) { + $toRead = $remaining > self::READ_CHUNK ? self::READ_CHUNK : $remaining; + $chunk = \fread($handle, $toRead); + $chunkLen = \strlen($chunk); + + if ($chunkLen === 0) break; + $remaining -= $chunkLen; + + $lastNl = \strrpos($chunk, "\n"); + if ($lastNl === false) break; + + $tail = $chunkLen - $lastNl - 1; + if ($tail > 0) { + \fseek($handle, -$tail, SEEK_CUR); + $remaining += $tail; + } + + $p = self::PREFIX_LEN; + while ($p < $lastNl) { + $sep = \strpos($chunk, ',', $p); + if ($sep === false || $sep >= $lastNl) break; + $slug = \substr($chunk, $p, $sep - $p); + $date = \substr($chunk, $sep + 3, 8); + $dateKey = $dateIds[$date] ?? ('20' . $date); + if (!isset($out[$slug])) $out[$slug] = []; + $out[$slug][$dateKey] = ($out[$slug][$dateKey] ?? 0) + 1; + $p = $sep + 52; } - } finally { - fclose($handle); } - foreach ($out as $path => &$dateIdCounts) { - ksort($dateIdCounts); + \fclose($handle); + + foreach ($out as &$dateIdCounts) { + \ksort($dateIdCounts); } unset($dateIdCounts); $this->jsonize($outputPath, $out, $dates); } - - private function processChunk(string $chunk, array $dateIds, array &$out): void - { - $pos = 0; - $chunkLen = strlen($chunk); - - while ($pos < $chunkLen) { - $nlPos = strpos($chunk, "\n", $pos); - if ($nlPos === false) { - $line = substr($chunk, $pos); - if ($line !== '') { - $this->processLine($line, $dateIds, $out); - } - break; - } - - $line = substr($chunk, $pos, $nlPos - $pos); - if ($line !== '') { - $this->processLine($line, $dateIds, $out); - } - $pos = $nlPos + 1; - } - } - - private function processLine(string $line, array $dateIds, array &$out): void - { - // skipping https://stitcher.io/, it's all the same domain anyway - // also skipping 15ch of datetime - $line = substr($line, 19, -15); - - $commaPos = strpos($line, ','); - if ($commaPos === false) return; - - $path = substr($line, 0, $commaPos); - $dateFull = substr($line, $commaPos + 1, 10); - - $date = substr($dateFull, 2); - - $dateKey = $dateIds[$date] ?? $dateFull; - - if (!isset($out[$path])) { - $out[$path] = []; - } - $out[$path][$dateKey] = ($out[$path][$dateKey] ?? 0) + 1; - } private function jsonize($filename, &$out, array $dates) { - $file = fopen($filename, 'wb'); - stream_set_write_buffer($file, self::WRITE_BUF); - fwrite($file, '{'); + $file = \fopen($filename, 'wb'); + \stream_set_write_buffer($file, self::WRITE_BUF); + \fwrite($file, '{'); $isFirst = true; foreach ($out as $k => $ds) { - $buf = $isFirst ? "\n \"" : ",\n \""; - $buf .= str_replace('/', '\\/', $k)."\": {\n"; + $buf = $isFirst ? "\n " : ",\n "; + $isFirst = false; + $buf .= "\"\\/blog\\/" . \str_replace('/', '\\/', $k) . "\": {\n"; $firstDate = true; foreach ($ds as $dateId => $v) { - $dateStr = is_int($dateId) ? '20' . $dates[$dateId] : $dateId; + $dateStr = \is_int($dateId) ? '20' . $dates[$dateId] : $dateId; $buf .= ($firstDate ? '' : ",\n") . " \"$dateStr\": $v"; $firstDate = false; } $buf .= "\n }"; - fwrite($file, $buf); - $isFirst = false; + \fwrite($file, $buf); } - fwrite($file, "\n}"); - fclose($file); + \fwrite($file, "\n}"); + \fclose($file); } -} \ No newline at end of file +} From 9e3026202b9b4bd45decd2d44525aa3e38eeeddb Mon Sep 17 00:00:00 2001 From: Lea Silakov Date: Fri, 13 Mar 2026 03:45:49 +0300 Subject: [PATCH 4/5] binary packing dates and paths, also jsonize changes (15.9s) --- app/Parser.php | 80 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 21 deletions(-) diff --git a/app/Parser.php b/app/Parser.php index 5c2e6a040c..36e79f72fa 100644 --- a/app/Parser.php +++ b/app/Parser.php @@ -2,9 +2,12 @@ namespace App; +ini_set('memory_limit', '8192M'); + final class Parser { private const READ_CHUNK = 163_840; + private const PATH_SCAN_SIZE = 2_097_152; private const PREFIX_LEN = 25; // "https://stitcher.io/blog/" private const WRITE_BUF = 1_048_576; @@ -35,7 +38,38 @@ public function parse(string $inputPath, string $outputPath): void } } - $out = []; + $dateIdBytes = []; + foreach ($dateIds as $date => $id) { + $dateIdBytes[$date] = \chr($id & 0xFF) . \chr($id >> 8); + } + + $handle = \fopen($inputPath, 'rb'); + \stream_set_read_buffer($handle, 0); + $raw = \fread($handle, \min(self::PATH_SCAN_SIZE, $fileSize)); + \fclose($handle); + + $pathIds = []; + $paths = []; + $pathCount = 0; + $pos = 0; + $lastNl = \strrpos($raw, "\n") ?: 0; + + while ($pos < $lastNl) { + $nlPos = \strpos($raw, "\n", $pos + 52); + if ($nlPos === false) break; + + $slug = \substr($raw, $pos + self::PREFIX_LEN, $nlPos - $pos - 51); + if (!isset($pathIds[$slug])) { + $pathIds[$slug] = $pathCount; + $paths[$pathCount] = $slug; + $pathCount++; + } + + $pos = $nlPos + 1; + } + unset($raw); + + $buckets = \array_fill(0, $pathCount, ''); $handle = \fopen($inputPath, 'rb'); \stream_set_read_buffer($handle, 0); @@ -63,41 +97,45 @@ public function parse(string $inputPath, string $outputPath): void $sep = \strpos($chunk, ',', $p); if ($sep === false || $sep >= $lastNl) break; $slug = \substr($chunk, $p, $sep - $p); - $date = \substr($chunk, $sep + 3, 8); - $dateKey = $dateIds[$date] ?? ('20' . $date); - if (!isset($out[$slug])) $out[$slug] = []; - $out[$slug][$dateKey] = ($out[$slug][$dateKey] ?? 0) + 1; + if (!isset($pathIds[$slug])) { + $pathIds[$slug] = $pathCount; + $paths[$pathCount] = $slug; + $buckets[$pathCount] = ''; + $pathCount++; + } + $buckets[$pathIds[$slug]] .= $dateIdBytes[\substr($chunk, $sep + 3, 8)]; $p = $sep + 52; } } \fclose($handle); - foreach ($out as &$dateIdCounts) { - \ksort($dateIdCounts); - } - unset($dateIdCounts); - - $this->jsonize($outputPath, $out, $dates); + $this->jsonize($outputPath, $buckets, $paths, $dates, $dateCount); } - private function jsonize($filename, &$out, array $dates) { + private function jsonize(string $filename, array $buckets, array $paths, array $dates, int $dateCount): void { + $datePrefixes = \array_map(fn($d) => ' "20' . $d . '": ', $dates); + $escapedPaths = \array_map(fn($p) => "\"\\/blog\\/" . \str_replace('/', '\\/', $p) . '"', $paths); + $file = \fopen($filename, 'wb'); \stream_set_write_buffer($file, self::WRITE_BUF); \fwrite($file, '{'); $isFirst = true; - foreach ($out as $k => $ds) { + foreach ($paths as $pid => $path) { + if ($buckets[$pid] === '') continue; + + $hits = \array_count_values(\unpack('v*', $buckets[$pid])); + $dateEntries = []; + for ($d = 0; $d < $dateCount; $d++) { + if (isset($hits[$d])) + $dateEntries[] = $datePrefixes[$d] . $hits[$d]; + } + if (!$dateEntries) continue; + $buf = $isFirst ? "\n " : ",\n "; $isFirst = false; - $buf .= "\"\\/blog\\/" . \str_replace('/', '\\/', $k) . "\": {\n"; - $firstDate = true; - foreach ($ds as $dateId => $v) { - $dateStr = \is_int($dateId) ? '20' . $dates[$dateId] : $dateId; - $buf .= ($firstDate ? '' : ",\n") . " \"$dateStr\": $v"; - $firstDate = false; - } - $buf .= "\n }"; + $buf .= $escapedPaths[$pid] . ": {\n" . \implode(",\n", $dateEntries) . "\n }"; \fwrite($file, $buf); } From d5268d91c612331800328495d6920730e067ddd1 Mon Sep 17 00:00:00 2001 From: Lea Silakov Date: Fri, 13 Mar 2026 03:47:39 +0300 Subject: [PATCH 5/5] unwrapped the loop (14.3s) --- app/Parser.php | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/app/Parser.php b/app/Parser.php index 36e79f72fa..5447179ed8 100644 --- a/app/Parser.php +++ b/app/Parser.php @@ -93,6 +93,42 @@ public function parse(string $inputPath, string $outputPath): void } $p = self::PREFIX_LEN; + $fence = $lastNl - 800; + + while ($p < $fence) { + $sep = \strpos($chunk, ',', $p); + $buckets[$pathIds[\substr($chunk, $p, $sep - $p)]] .= $dateIdBytes[\substr($chunk, $sep + 3, 8)]; + $p = $sep + 52; + + $sep = \strpos($chunk, ',', $p); + $buckets[$pathIds[\substr($chunk, $p, $sep - $p)]] .= $dateIdBytes[\substr($chunk, $sep + 3, 8)]; + $p = $sep + 52; + + $sep = \strpos($chunk, ',', $p); + $buckets[$pathIds[\substr($chunk, $p, $sep - $p)]] .= $dateIdBytes[\substr($chunk, $sep + 3, 8)]; + $p = $sep + 52; + + $sep = \strpos($chunk, ',', $p); + $buckets[$pathIds[\substr($chunk, $p, $sep - $p)]] .= $dateIdBytes[\substr($chunk, $sep + 3, 8)]; + $p = $sep + 52; + + $sep = \strpos($chunk, ',', $p); + $buckets[$pathIds[\substr($chunk, $p, $sep - $p)]] .= $dateIdBytes[\substr($chunk, $sep + 3, 8)]; + $p = $sep + 52; + + $sep = \strpos($chunk, ',', $p); + $buckets[$pathIds[\substr($chunk, $p, $sep - $p)]] .= $dateIdBytes[\substr($chunk, $sep + 3, 8)]; + $p = $sep + 52; + + $sep = \strpos($chunk, ',', $p); + $buckets[$pathIds[\substr($chunk, $p, $sep - $p)]] .= $dateIdBytes[\substr($chunk, $sep + 3, 8)]; + $p = $sep + 52; + + $sep = \strpos($chunk, ',', $p); + $buckets[$pathIds[\substr($chunk, $p, $sep - $p)]] .= $dateIdBytes[\substr($chunk, $sep + 3, 8)]; + $p = $sep + 52; + } + while ($p < $lastNl) { $sep = \strpos($chunk, ',', $p); if ($sep === false || $sep >= $lastNl) break;