From bc327ccc8b705e3c83c538957c3abd9da8a099f4 Mon Sep 17 00:00:00 2001
From: James Hayhurst
Date: Tue, 15 Jul 2025 16:04:16 +0100
Subject: [PATCH 1/2] add intervals data to clickhouse update intervals schema

---
 config/clickhouse/schema/intervals_log.sql | 21 +++++++++++++++++++++
 config/clickhouse/scripts/intervals.sql    | 15 +++++++++++++++
 config/config.yaml                         |  1 +
 config/datasets.yaml                       |  4 ++++
 deployment/variables.tf                    |  2 +-
 5 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 config/clickhouse/schema/intervals_log.sql
 create mode 100644 config/clickhouse/scripts/intervals.sql

diff --git a/config/clickhouse/schema/intervals_log.sql b/config/clickhouse/schema/intervals_log.sql
new file mode 100644
index 0000000..291ab8b
--- /dev/null
+++ b/config/clickhouse/schema/intervals_log.sql
@@ -0,0 +1,21 @@
+-- Staging table for the intervals dataset: the loader fills it, then
+-- config/clickhouse/scripts/intervals.sql copies it into ot.intervals and drops it.
+create database if not exists ot;
+
+create table if not exists ot.intervals_log
+(
+    `chromosome` Enum8('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y'),
+    `start` UInt32,
+    `end` UInt32,
+    `geneId` String,
+    `biosampleName` LowCardinality(String),
+    `biosampleId` LowCardinality(String),
+    `intervalType` LowCardinality(String),
+    `distanceToTSS` UInt32,
+    `score` Float64,
+    `resourceScore` Array(Tuple(name LowCardinality(String), value Float64)),
+    `datasourceId` LowCardinality(String),
+    `pmid` String,
+    `studyId` String
+)
+engine = Log;
diff --git a/config/clickhouse/scripts/intervals.sql b/config/clickhouse/scripts/intervals.sql
new file mode 100644
index 0000000..69bb5f5
--- /dev/null
+++ b/config/clickhouse/scripts/intervals.sql
@@ -0,0 +1,15 @@
+-- Post-load step for the intervals dataset: copy every staged row into the
+-- final MergeTree table ordered by (chromosome, start, end), then drop the staging table.
+create database if not exists ot;
+
+create table if not exists ot.intervals engine = MergeTree ()
+order by
+    (chromosome, start, end) as (
+        select
+            *
+        from
+            ot.intervals_log
+    );
+
+-- `if exists` makes the cleanup a no-op rather than an error when the staging table is absent.
+drop table if exists ot.intervals_log;
diff --git a/config/config.yaml b/config/config.yaml
index 97aeac1..6231c8b
100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -221,6 +221,7 @@ steps: - name: explode load all datasets foreach: - aotf + - intervals - literature - w2v - sentences diff --git a/config/datasets.yaml b/config/datasets.yaml index e3140bd..43b579a 100644 --- a/config/datasets.yaml +++ b/config/datasets.yaml @@ -336,6 +336,10 @@ clickhouse: table: associations_otf_log input_dir: view/association_otf postload_script: config/clickhouse/scripts/aotf.sql + intervals: + table: intervals_log + input_dir: output/intervals + postload_script: config/clickhouse/scripts/intervals.sql literature: table: literature_log input_dir: output/literature diff --git a/deployment/variables.tf b/deployment/variables.tf index 10bfa47..be5a388 100644 --- a/deployment/variables.tf +++ b/deployment/variables.tf @@ -87,7 +87,7 @@ variable "open_search_jvm_options" { variable "clickhouse_image_tag" { description = "Clickhouse image tag" type = string - default = "23.3.1.2823" + default = "25.6.3.116" } variable "data_location_source" { From 36f84e0a3148451984298f98d108a89a61e2d96f Mon Sep 17 00:00:00 2001 From: James Hayhurst Date: Mon, 21 Jul 2025 13:57:39 +0100 Subject: [PATCH 2/2] changed date overflow behaviour to saturate --- config/clickhouse/users.d/users.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/clickhouse/users.d/users.xml b/config/clickhouse/users.d/users.xml index 1c6ec2e..ba30dea 100644 --- a/config/clickhouse/users.d/users.xml +++ b/config/clickhouse/users.d/users.xml @@ -9,6 +9,7 @@ 1 random 1048576 + saturate 30000000000