Allow subcommands
aliok committed Dec 28, 2023
1 parent 87273cc commit 2eb6052
Showing 13 changed files with 575 additions and 190 deletions.
82 changes: 75 additions & 7 deletions README.md
@@ -33,19 +33,34 @@ This design allows the process to be fully extensible. You can write your own co
npm install -g @opentr/cuttlecat
```

## Running cuttlecat
## Usage

<!---
node dist/index.js --help
--->
```shell
cuttlecat --command-file=<your command file> --data-directory=<data directory> --github-token=<github token>
Usage: cuttlecat <command> [options]

Commands:
cuttlecat execute Execute the command within the given file and store the result.
  cuttlecat latest-queue-complete    Checks if the latest queue is marked as complete and prints the result in the stdout.
cuttlecat requeue-tasks Manually requeue tasks for trying them again.


Options:
--help Show help [boolean]
--version Show version number [boolean]
```
### Usage
## Command `execute`
<!---
node dist/index.js --help
node dist/index.js execute --help
--->
```shell
Usage: cuttlecat --command-file=<your command file> [options]
Usage: cuttlecat execute [options]
Run cuttlecat --help for help on common options.
Execute the command within the given file and store the result.

Required options
--command-file Command file to load. [string] [required]
@@ -54,6 +69,7 @@ Required options

Options:
--help Show help [boolean]
--version Show version number [boolean]
--renew-period-in-days Number of days to wait until creating a new queue after the latest one is completed. [number] [default: 7]
--concurrency Number of concurrent tasks to process the queue. As this search is IO bound and CPU bound, there can be many concurrent tasks (more than the number of cores). However, because of the rate limiting, there will be a lot of idle tasks. So, it is recommended to keep concurrency low. [number] [default: 6]
  --per-task-timeout-in-ms           Timeout in milliseconds for each task in the queue. Keeping the timeout too long will end up using too many GitHub actions minutes. Keeping the timeout too short will result in too many errored items.  [number] [default: 30000]
@@ -75,14 +91,14 @@ Examples:
--report-period-in-ms=5000 Print the queue state to stdout every 5 seconds. This is useful to see how many tasks are in the queue, how many are completed, how many are errored, etc.
```
### Running the sample command
### Running the sample search command
To run the sample command:
```shell
rm -rf /tmp/foo/bar
mkdir -p /tmp/foo/bar
cuttlecat --command-file="./test/test_tasks/basicUserSearch.js" \
cuttlecat execute --command-file="../test/test_tasks/basicUserSearch.js" \
--data-directory="/tmp/foo/bar" \
--github-token="$(gh auth token)"
```
@@ -92,6 +108,58 @@ The output will be stored in `/tmp/foo/bar` directory.
See [`src/test/test_tasks/basicUserSearch.ts`](src/test/test_tasks/basicUserSearch.ts) for the implementation of the sample command.
## Command `latest-queue-complete`
<!---
node dist/index.js latest-queue-complete --help
--->
```shell
Usage: cuttlecat latest-queue-complete [options]
Run cuttlecat --help for help on common options.
Checks if the latest queue is marked as complete and prints the result in the stdout.
Options:
--help Show help [boolean]
--version Show version number [boolean]
--data-directory Data directory to check the process files. [string] [required]
Examples:
NOTE: Examples below are not executable commands, they are just examples of how to use the command.
  --data-directory=/path/to/data/directory  Check if the latest state file in the given directory was complete. After you start another queue that produces a state file, you can run this command to check if it is complete. This command writes true or false to stdout, which can be used in a script to determine if the previous queue was done.
```
Example execution:
```shell
$ cuttlecat latest-queue-complete --data-directory="/tmp/foo/bar"
true
```
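
Because the command only writes `true` or `false` to stdout, the result can be checked directly in a shell script. The snippet below is a minimal sketch of that idea (it is not part of this commit); it reuses the data directory from the example above, and the post-processing step is just a placeholder.

```shell
# Hypothetical wrapper: run a downstream step only when the latest queue has finished.
if [ "$(cuttlecat latest-queue-complete --data-directory="/tmp/foo/bar")" = "true" ]; then
  echo "Latest queue is complete; post-process the output in /tmp/foo/bar here."
else
  echo "Latest queue is not complete yet; skipping post-processing."
fi
```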
## Command `requeue-tasks`
<!---
node dist/index.js requeue-tasks --help
--->
```shell
Usage: cuttlecat requeue-tasks [options]
Run cuttlecat --help for help on common options.
Manually requeue tasks for trying them again.
Options:
--help Show help [boolean]
--version Show version number [boolean]
--requeue-type Type of tasks to requeue. 'errored' will requeue all errored tasks. 'non-critical-errored' will requeue tasks that are not in the `errored` bucket, but resolved with non-critical errors. [required] [choices: "errored", "non-critical-errored"]
  --data-directory  Data directory for the task states and outputs.  [string] [required]
--timestamp Directory name under data-directory. [string] [required]
```
Example execution:
```shell
cuttlecat requeue-tasks \
--requeue-type="non-critical-errored" \
--data-directory="/tmp/foo/bar" \
--timestamp="1234"
```
## Implement your own search command
To implement your own search command, you need to create a command file. The command file is a JavaScript file that exports a class that implements the [`Command` interface](src/graphql/command.ts).
160 changes: 3 additions & 157 deletions src/arguments.ts
@@ -10,164 +10,10 @@ type ExtractOptionsType<YargsType> = YargsType extends Argv<infer Type> ? Type : never
// this is useful, because this would provide compile time checking when using the options.
export type GetBuiltOptionsType<T> = ArgumentsCamelCase<ExtractOptionsType<GetReturnType<T>>>

export type Args = GetBuiltOptionsType<typeof addArguments>;

const REQUIRED_OPTIONS_GROUP = "Required options";

export function getYargs() {
return yargs(process.argv.slice(2))
.usage("Usage: $0 --command-file=<command file> [options]")
.usage("Usage: $0 <command> [options]")
.strictCommands()
.wrap(null)
.version(false);
}

export function addArguments(y:Argv<any>) {
return y
.example(
"--data-directory=/path/to/directory",
"Store the state of the process and the output in /path/to/directory, so that subsequent executions of the same command can be resumed."
)
.example(
"--renew-period-in-days=7",
"If the process is complete (all search periods are processed), don't start a new search until 7 days has passed after the latest completion."
)
.example(
"--concurrency=6 --interval-cap=4 --interval-in-ms=20000",
"Start 6 concurrent tasks each time, and execute 4 tasks in every 20 seconds. (change these to avoid hitting GitHub secondary rate limits)"
)
.example(
"--retry-count=3",
"When a task fails, retry 3 times (in total, 4 times). If it still fails, process will create tasks that have narrower scopes. If the task's scope can be " +
"narrowed down, then the task will be archived. If not, it will stay in the errored list. This narrowing down will also happen for any narrowed-down tasks " +
"that fail (tried 4 times in total), until they cannot be narrowed down anymore. " +
"For the commands that use a date range to search for, tasks for shorter search ranges will be created that in total wrap the " +
"failing task's search range."
)
.example(
"--per-task-timeout-in-ms=30000",
"For each task, wait for 30 seconds before timing out. You change this to avoid spending too much GitHub action minutes. If the timeout" +
"is too short, there will be too many errored items. However, the process will retry and create narrower scoped tasks for errored items, so, having a " +
"very long timeout is not very useful."
)
.example(
"--report-period-in-ms=5000",
"Print the queue state to stdout every 5 seconds. This is useful to see how many tasks are in the queue, how many are completed, how many are errored, etc. "
)
.options({
"command-file": {
type: "string",
desc: "Command file to load.",
demandOption: true,
group: REQUIRED_OPTIONS_GROUP,
},
"data-directory": {
type: "string",
desc: "Data directory to read and store the output.",
demandOption: true,
group: REQUIRED_OPTIONS_GROUP,
},
"github-token": {
type: "string",
desc: "GitHub API token. Token might need permissions based on your task.",
demandOption: true,
group: REQUIRED_OPTIONS_GROUP,
},

// optional stuff
"renew-period-in-days": {
type: "number",
desc: "Number of days to wait until creating a new queue after the latest one is completed.",
default: 7,
},
"concurrency": {
type: "number",
desc:
"Number of concurrent tasks to process the queue. " +
"As this search is IO bound and CPU bound, there can be many concurrent tasks (more than the number of cores). " +
"However, because of the rate limiting, there will be a lot of idle tasks. " +
"So, it is recommended to keep concurrency low.",
default: 6,
},
"per-task-timeout-in-ms": {
type: "number",
desc:
"Timeout in milliseconds for each task in the queue." +
"Keeping the timeout too long will end up using too many GitHub actions minutes." +
"Keeping the timeout too short will result in too many errored items.",
default: 30000,
},
// About rate limits...
// ref1: https://docs.github.com/en/free-pro-team@latest/rest/search/search?apiVersion=2022-11-28#search-users
// ref2: https://docs.github.com/en/free-pro-team@latest/rest/search/search?apiVersion=2022-11-28#rate-limit
// ref3: https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#rate-limits-for-requests-from-personal-accounts
// ref4: https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#rate-limits-for-requests-from-github-actions
// Numbers:
// The REST API has a custom rate limit for searching. ... you can make up to 30 requests per minute
// User access token requests are limited to 5,000 requests per hour ...
// When using GITHUB_TOKEN, the rate limit is 1,000 requests per hour per repository.
//
// Bottleneck is search endpoint, which is limited to 30 requests per minute.
// And the worst part is, that it's not reported by the RateLimit object in GraphQL response.
// We only know when we reached the limit.
// The queue will abort when primary (1000 requests per hour) or secondary (30 requests per minute) rate limit is reached.
// So that we can retry later, instead of waiting and using the GitHub action minutes.
//
// Another note is that instead of using an interval of 60 seconds and a cap of 30, we should use shorter intervals and a lower cap.
// Otherwise, what happens is that queue will execute 30 tasks in ~10 seconds, and then wait for 50 seconds.
// That's a burst-y behavior, and we should avoid that.
// A good number to start with is 10 seconds and 5 tasks.
//
// Finally, let's leave some gap for the secondary rate limit.
// Instead of 10 seconds and 5 tasks, let's use 12 seconds and 4 tasks (means 20 reqs/min).
//
// These numbers can be overridden by env vars.
"rate-limit-stop-percent": {
type: "number",
desc: "Under this rate limit remaining percent, stop the queue.",
default: 10,
},
"interval-cap": {
type: "number",
desc: "Max number of tasks to execute in the given interval by interval-in-ms.",
default: 4,
},
"interval-in-ms": {
type: "number",
desc: "Interval for the cap in milliseconds.",
default: 20000,
},
"retry-count": {
type: "number",
desc: "Number of retries for each task before giving up of creating narrower scoped tasks.",
default: 3,
},

// debug related stuff
"record-http-calls": {
type: "boolean",
desc:
"Record HTTP calls to disk for debugging purposes. " +
"\"Nock back\" will be used in `record` mode where the new records will be created. " +
"The calls will be stored in the `./nock-records/` directory, relative to the command path.",
default: false,
},
"log-level": {
type: "string",
desc: "Log level to use.",
default: "info",
},
"max-run-time-in-minutes": {
type: "number",
desc: "When to stop the command gracefully. For example GitHub Actions has a 3 hour limit and " +
"when it cancels, nothing is saved. However, GitHub sometimes cancels before the limit to " +
"possibly make rooms for other systems/actions, so set it a bit lower than the limit.",
default: 60, // default to 1 hour
},
"report-period-in-ms": {
type: "number",
desc: "Period in milliseconds to print the queue state to stdout (0 for disabled)",
default: 5000,
},
});
.demandCommand(1, 1, "You need to specify a command.", "You can only specify one command.")
.wrap(null);
}
43 changes: 42 additions & 1 deletion src/index.ts
@@ -1,6 +1,47 @@
#!/usr/bin/env node

import {main} from "./main.js";
import {getYargs} from "./arguments.js";
import {SubCommand} from "./subcommand.js";

import {CommandDefinition as ExecuteCommandDefinition} from "./subcommand/execute.js";
import {CommandDefinition as LatestQueueCompleteCommandDefinition} from "./subcommand/latestQueueComplete.js";
import {CommandDefinition as RequeueTasksCommandDefinition} from "./subcommand/requeueTasks.js";

function buildCommands() {
    // NOTE: keep sorted
    const commands:{ [key:string]:SubCommand } = {
        [ExecuteCommandDefinition.commandName]: ExecuteCommandDefinition,
        [LatestQueueCompleteCommandDefinition.commandName]: LatestQueueCompleteCommandDefinition,
        [RequeueTasksCommandDefinition.commandName]: RequeueTasksCommandDefinition,
    };

    return commands;
}

async function main() {
    const subCommands = buildCommands();

    let y = getYargs();

    for (const subCommand of Object.values(subCommands)) {
        y = y.command(
            subCommand.commandName,
            subCommand.commandDescription,
            (y_cmd) => {
                y_cmd = y_cmd
                    .usage(`Usage: $0 ${subCommand.commandName} [options]`)
                    .usage(`Run $0 --help for help on common options.`)
                    .usage(subCommand.commandDescription);
                subCommand.addArguments(y_cmd);
            });
    }

    const args = y.parseSync();

    const commandToRun = args._[0];

    await subCommands[commandToRun].main(args);
}

(async () => {
    await main();
8 changes: 8 additions & 0 deletions src/subcommand.ts
@@ -0,0 +1,8 @@
import {Argv} from "yargs";

export type SubCommand = {
    readonly commandName:string,
    readonly commandDescription:string,
    addArguments:(y:Argv) => Argv,
    readonly main:(args:any) => Promise<void>,
}
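
As an illustration of how this type is meant to be used, a subcommand definition conforming to `SubCommand` could look roughly like the sketch below. This is hypothetical: it is not a file in this commit, and the command name and the `--name` option are made up.

```typescript
import {Argv} from "yargs";

import {SubCommand} from "./subcommand.js";

// Hypothetical subcommand definition conforming to the SubCommand type above.
export const CommandDefinition:SubCommand = {
    commandName: "hello",
    commandDescription: "Print a greeting.",

    addArguments: (y:Argv) => y.options({
        "name": {
            type: "string",
            desc: "Name to greet.",
            default: "world",
        },
    }),

    main: async (args:any) => {
        console.log(`Hello, ${args.name}!`);
    },
};
```

Registering such a definition is then just a matter of adding it to the map returned by `buildCommands()` in `src/index.ts` above.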
14 changes: 7 additions & 7 deletions src/main.test.ts → src/subcommand/execute.test.ts
@@ -2,19 +2,19 @@ import {graphql} from "@octokit/graphql";
import {expect} from "chai";
import mockfs, {restore as mockfsRestore} from "mock-fs";

import {TaskContext} from "./graphql/context.js";
import {FakeCommand, fakeNow, FakeResult, FakeTaskSpec} from "./graphql/fake.test.js";
import * as log from "./log.js";
import {TaskContext} from "../graphql/context.js";
import {FakeCommand, fakeNow, FakeResult, FakeTaskSpec} from "../graphql/fake.test.js";
import * as log from "../log.js";
import {ProcessFileHelper} from "../processFileHelper.js";
import {ErroredTask, TaskQueue} from "../queue/taskqueue.js";
import {formatDate} from "../utils.js";
import {
    addErroredToUnresolved,
    checkFileCompleted,
    getOrCreateLatestProcessState,
    initializeQueue,
    ProcessState
} from "./main.js";
import {ProcessFileHelper} from "./processFileHelper.js";
import {ErroredTask, TaskQueue} from "./queue/taskqueue.js";
import {formatDate} from "./utils.js";
} from "./execute.js";

const logger = log.createLogger("index/test");
