-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
2087 lines (1863 loc) · 113 KB
/
Makefile
File metadata and controls
2087 lines (1863 loc) · 113 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# MicroMediaParam Pipeline Makefile
#
# This Makefile reproduces the complete bioinformatics pipeline for processing
# microbial growth media composition data, from PDF downloads to final analysis.
#
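# Pipeline stages (numbering follows the "Stage N" rule comments and `make help`):
#  1. Data acquisition (parse URLs, download PDFs/JSON)
#  2. Data conversion (PDFs to text, JSON to markdown)
#  3. DB mapping (chemical properties database built from IUPAC/PubChem data)
#  4. Initial knowledge graph mapping (ingredients to ChEBI/KEGG/PubChem)
#  5. Solution expansion (DSMZ solution: references to chemical components)
#  6. Early hydration normalization and deduplication
#  7. Early ingredient enhancement (ingredient: codes to ChEBI)
#  8. Compound matching
#  9. OAK ChEBI mapping
# 10. Mapping merge and confidence filtering
# 10.5 Mapping enhancements (CAS-to-ChEBI, formula matching, microbiology products)
# 11. Property calculation (pH, salinity, ionic strength)
# 12. Final media summary generation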
# Tooling and source-layout settings.
#   PYTHON       - interpreter command placed in front of every script invocation
#   SCRIPTS_DIR  - directory holding the stage scripts called as $(SCRIPTS_DIR)/<name>.py
#   REQUIREMENTS - dependency list file (presumably consumed by the install targets; confirm)
#   VENV_DIR     - virtual environment directory (presumably used by setup-venv; confirm)
PYTHON       := python
SCRIPTS_DIR  := src/scripts
REQUIREMENTS := requirements.txt
VENV_DIR     := venv
# Output tree: every stage directory lives directly beneath $(OUTPUT_DIR).
OUTPUT_DIR                 := pipeline_output
DATA_ACQUISITION_DIR       := $(OUTPUT_DIR)/data_acquisition
DATA_CONVERSION_DIR        := $(OUTPUT_DIR)/data_conversion
DB_MAPPING_DIR             := $(OUTPUT_DIR)/db_mapping
KG_MAPPING_DIR             := $(OUTPUT_DIR)/kg_mapping
SOLUTION_EXPANSION_DIR     := $(OUTPUT_DIR)/solution_expansion
COMPOUND_MATCHING_DIR      := $(OUTPUT_DIR)/compound_matching
OAK_CHEBI_DIR              := $(OUTPUT_DIR)/oak_chebi
MERGE_MAPPINGS_DIR         := $(OUTPUT_DIR)/merge_mappings
INGREDIENT_ENHANCEMENT_DIR := $(OUTPUT_DIR)/ingredient_enhancement
HYDRATE_NORMALIZATION_DIR  := $(OUTPUT_DIR)/hydrate_normalization
PROPERTY_CALCULATION_DIR   := $(OUTPUT_DIR)/property_calculation
MEDIA_SUMMARY_DIR          := $(OUTPUT_DIR)/media_summary

# Per-media working directories, nested inside the stage directories above.
MEDIA_PDFS_DIR         := $(DATA_ACQUISITION_DIR)/media_pdfs
MEDIA_TEXTS_DIR        := $(DATA_CONVERSION_DIR)/media_texts
MEDIA_COMPOSITIONS_DIR := $(DATA_CONVERSION_DIR)/media_compositions
MEDIA_PROPERTIES_DIR   := $(PROPERTY_CALCULATION_DIR)/media_properties
# Intermediate files handed from one stage to the next.
GROWTH_MEDIA_URLS         := $(DATA_ACQUISITION_DIR)/growth_media_urls.txt
COMPOSITION_MAPPING       := $(KG_MAPPING_DIR)/composition_kg_mapping.tsv
EXPANDED_MAPPING          := $(SOLUTION_EXPANSION_DIR)/composition_kg_mapping_expanded_solutions.tsv
SOLUTION_EXPANSION_REPORT := $(SOLUTION_EXPANSION_DIR)/dsmz_solution_expansion_report.json
UNACCOUNTED_MATCHES       := $(COMPOUND_MATCHING_DIR)/unaccounted_compound_matches.tsv
UNIFIED_MAPPINGS          := $(MERGE_MAPPINGS_DIR)/unified_compound_mappings.tsv

# The high-confidence mapping file and its successive enhancement outputs share
# one filename stem inside $(MERGE_MAPPINGS_DIR); only the suffix differs.
hc_mappings_stem := $(MERGE_MAPPINGS_DIR)/high_confidence_compound_mappings
HIGH_CONFIDENCE_MAPPINGS := $(hc_mappings_stem).tsv
LOW_CONFIDENCE_MAPPINGS  := $(MERGE_MAPPINGS_DIR)/low_confidence_compound_mappings.tsv
HIGH_CONFIDENCE_UPGRADED := $(hc_mappings_stem)_upgraded.tsv
HIGH_CONFIDENCE_FORMULA  := $(hc_mappings_stem)_formula_enhanced.tsv
HIGH_CONFIDENCE_FINAL    := $(hc_mappings_stem)_final.tsv
HIGH_CONFIDENCE_CURATED  := $(hc_mappings_stem)_curated_upgraded.tsv
HIGH_CONFIDENCE_ENRICHED := $(hc_mappings_stem)_enriched.tsv

# Final output files (clean names)
COMPOUND_MAPPINGS          := $(MERGE_MAPPINGS_DIR)/compound_mappings.tsv
COMPOUND_MAPPINGS_LOW      := $(MERGE_MAPPINGS_DIR)/compound_mappings_low_confidence.tsv
UNMAPPED_COMPOUNDS         := $(MERGE_MAPPINGS_DIR)/unmapped_compounds.tsv
INGREDIENT_ENHANCED_HIGH   := $(INGREDIENT_ENHANCEMENT_DIR)/high_confidence_compound_mappings_ingredient_enhanced.tsv
INGREDIENT_ENHANCED_LOW    := $(INGREDIENT_ENHANCEMENT_DIR)/low_confidence_compound_mappings_ingredient_enhanced.tsv
HIGH_CONFIDENCE_NORMALIZED := $(HYDRATE_NORMALIZATION_DIR)/high_confidence_compound_mappings_normalized.tsv
LOW_CONFIDENCE_NORMALIZED  := $(HYDRATE_NORMALIZATION_DIR)/low_confidence_compound_mappings_normalized.tsv
MEDIA_SUMMARY              := $(MEDIA_SUMMARY_DIR)/media_summary.tsv
MEDIA_COMPOSITION_TABLE    := $(MEDIA_SUMMARY_DIR)/media_composition_table.tsv
MEDIA_COMPOSITION_EXPANDED := $(MEDIA_SUMMARY_DIR)/media_composition_expanded.tsv

# Curated inputs kept under data/ (outside the generated output tree).
COMPLEX_INGREDIENT_COMPOSITIONS := data/curated/complex_ingredients/complex_ingredient_compositions.yaml
MEDIADIVE_SOLUTIONS_YAML        := data/curated/complex_ingredients/mediadive_solutions_additions.yaml

# Chemical properties table and the unmapped-complex-ingredient analysis outputs.
CHEMICAL_PROPERTIES       := $(DB_MAPPING_DIR)/chemical_properties.tsv
UNMAPPED_COMPLEX_ANALYSIS := $(OUTPUT_DIR)/analysis/unmapped_complex_ingredients_priority.tsv
UNMAPPED_COMPLEX_REPORT   := $(OUTPUT_DIR)/analysis/unmapped_complex_ingredients_report.txt
# External data files (from the kg-microbe project).
# KG_MICROBE_BASE is the root of a local kg-microbe checkout. The default below
# is one developer's absolute path, so it is assigned with ?= : a value supplied
# through the environment (or the make command line) replaces it, e.g.
#   KG_MICROBE_BASE=/path/to/kg-microbe make kg-enhance-all
KG_MICROBE_BASE ?= /Users/marcin/Documents/VIMSS/ontology/KG-Hub/KG-Microbe/kg-microbe
# ChEBI node table passed to the Stage 10.5 enhancement scripts via --chebi-file.
CHEBI_NODES_FILE := $(KG_MICROBE_BASE)/data/transformed/ontologies/chebi_nodes.tsv
# Raw MediaDive JSON files inside the kg-microbe checkout.
MEDIADIVE_RAW_DIR := $(KG_MICROBE_BASE)/data/raw/mediadive
MEDIADIVE_SOLUTIONS_JSON := $(MEDIADIVE_RAW_DIR)/solutions.json
MEDIADIVE_MEDIA_JSON := $(MEDIADIVE_RAW_DIR)/media_detailed.json
MEDIADIVE_COMPOUNDS_JSON := $(MEDIADIVE_RAW_DIR)/compounds.json
# Log files (kept as a shell glob; it is expanded by the shell in recipes that use it)
LOGS := *.log

# Default target
.DEFAULT_GOAL := all

# ANSI colours for recipe output.
# ESC holds a literal escape byte, produced once while the makefile is read, so
# the colours render even when the recipe shell's `echo` does not interpret the
# "\033" backslash sequence (bash's builtin echo does not by default).
ESC := $(shell printf '\033')
RED := $(ESC)[31m
GREEN := $(ESC)[32m
YELLOW := $(ESC)[33m
BLUE := $(ESC)[34m
# Reset to the terminal default ("no colour"). The comment sits on its own line
# on purpose: whitespace between a value and a trailing "#" comment becomes part
# of the value, which previously appended a stray space after every reset.
NC := $(ESC)[0m
# help: print a colourised catalogue of the targets in this Makefile, grouped by
# topic, followed by a few usage examples. Output only; nothing is built.
.PHONY: help
help:
	@echo "$(BLUE)MicroMediaParam Pipeline Makefile$(NC)"
	@echo "================================"
	@echo ""
	@echo "$(GREEN)Main Pipeline Targets (Optimized Order):$(NC)"
	@echo " $(YELLOW)all$(NC) - Run complete optimized pipeline from start to finish"
	@echo " $(YELLOW)data-acquisition$(NC) - Step 1: Download media PDFs and JSON data"
	@echo " $(YELLOW)data-conversion$(NC) - Step 2: Convert PDFs to text and JSON to markdown"
	@echo " $(YELLOW)db-mapping$(NC) - Step 3: Download IUPAC/PubChem data & build DB (ingredient → pKa, properties)"
	@echo " $(YELLOW)kg-mapping-initial$(NC) - Step 4: Initial KG mapping (ingredient → ChEBI/KEGG IDs)"
	@echo " $(YELLOW)solution-expansion$(NC) - Step 5: ✨ Expand DSMZ solution: references to individual chemical components"
	@echo " $(YELLOW)normalize-hydration-early$(NC) - Step 6: 🔥 EARLY hydrate normalization for consistent base compounds"
	@echo " $(YELLOW)enhance-ingredients-early$(NC) - Step 7: 🔥 EARLY ingredient: → ChEBI matching with normalized compounds"
	@echo " $(YELLOW)kg-compound-matching$(NC) - Step 8: Enhanced compound matching using normalized base compounds"
	@echo " $(YELLOW)kg-oak-chebi-mapping$(NC) - Step 9: OAK CHEBI annotations with improved compound set"
	@echo " $(YELLOW)kg-merge-mappings$(NC) - Step 10: Merge all mapping sources with consistent hydration"
	@echo " $(YELLOW)kg-enhance-all$(NC) - Step 10.5: 🚀 Enhance mappings (CAS→ChEBI, formula, microbio) +16% coverage!"
	@echo " $(YELLOW)compute-properties$(NC) - Step 11: Calculate pH, salinity with enhanced mappings (72% coverage)"
	@echo " $(YELLOW)media-summary$(NC) - Step 12: Generate final media summary table"
	@echo ""
	@echo "$(GREEN)Mapping Strategy Overview:$(NC)"
	@echo " $(YELLOW)DB Mapping$(NC) (ingredient → pKa, properties): Downloads IUPAC/PubChem data, maximizes pKa coverage"
	@echo " $(YELLOW)KG Mapping$(NC) (ingredient → ChEBI/KEGG IDs): Maximizes ingredients with knowledge graph IDs"
	@echo " $(YELLOW)Goal$(NC): DB mappings enable pH/salinity calculations, KG mappings enable semantic analysis"
	@echo ""
	@echo "$(GREEN)Chemical Database Targets (IUPAC):$(NC)"
	@echo " $(YELLOW)iupac-full-pipeline$(NC) - Complete IUPAC pipeline: analyze → download → process → generate"
	@echo " $(YELLOW)iupac-update-from-mappings$(NC) - Update database from existing compound mappings"
	@echo " $(YELLOW)iupac-process-composition-mapping$(NC) - Process all compounds from composition_kg_mapping.tsv"
	@echo " $(YELLOW)iupac-add-compounds$(NC) - Add specific compounds (use COMPOUNDS='list')"
	@echo " $(YELLOW)iupac-test$(NC) - Test IUPAC system with sample compounds"
	@echo ""
	@echo "$(GREEN)Chemical Database Targets (PubChem):$(NC)"
	@echo " $(YELLOW)pubchem-full-pipeline$(NC) - Complete PubChem pipeline with bulk FTP downloads"
	@echo " $(YELLOW)pubchem-process-composition-mapping$(NC) - Process all compounds from composition_kg_mapping.tsv"
	@echo " $(YELLOW)pubchem-download-compounds$(NC) - Download specific compounds (use COMPOUNDS='list')"
	@echo " $(YELLOW)pubchem-test$(NC) - Test PubChem system with sample compounds"
	@echo ""
	@echo "$(GREEN)OAK CHEBI Mapping Targets:$(NC)"
	@echo " $(YELLOW)oak-chebi-mapping$(NC) - Complete pipeline: extract compounds → OAK annotate → apply mappings → fix hydration"
	@echo " $(YELLOW)extract-non-chebi-compounds$(NC) - Extract compounds needing CHEBI mapping (342 compounds)"
	@echo " $(YELLOW)oak-chebi-annotate$(NC) - Run OAK annotation against CHEBI ontology"
	@echo " $(YELLOW)apply-oak-chebi-mappings$(NC) - Apply OAK results to composition mapping"
	@echo " $(YELLOW)fix-hydrated-mappings$(NC) - Fix hydrated compounds mapped to ingredient codes"
	@echo " $(YELLOW)oak-chebi-test$(NC) - Test OAK connection with sample compounds"
	@echo " $(YELLOW)oak-chebi-status$(NC) - Show OAK CHEBI mapping status"
	@echo " $(YELLOW)oak-chebi-clean$(NC) - Clean OAK CHEBI mapping files"
	@echo ""
	@echo "$(GREEN)BacDive Metabolites Mapping:$(NC)"
	@echo " $(YELLOW)bacdive-metabolites-mapping$(NC) - Complete pipeline: extract → OAK annotate → apply mappings"
	@echo " $(YELLOW)bacdive-metabolites-extract$(NC) - Extract 154 unique metabolites from 19,129 records"
	@echo " $(YELLOW)bacdive-metabolites-status$(NC) - Show BacDive metabolites mapping status"
	@echo " $(YELLOW)bacdive-metabolites-clean$(NC) - Clean BacDive metabolites files"
	@echo ""
	@echo "$(GREEN)Unmapped Compounds Analysis:$(NC)"
	@echo " $(YELLOW)unmapped-full-pipeline$(NC) - Complete pipeline: analyze → map → integrate (+749 mappings)"
	@echo " $(YELLOW)unmapped-analysis$(NC) - Extract clean unmapped compounds from all sources"
	@echo " $(YELLOW)unmapped-map$(NC) - Map unmapped compounds using curated dictionary (~44% mapped)"
	@echo " $(YELLOW)unmapped-integrate$(NC) - Integrate new mappings back into high-confidence file"
	@echo " $(YELLOW)unmapped-status$(NC) - Show unmapped compounds summary"
	@echo " $(YELLOW)unmapped-clean$(NC) - Clean unmapped analysis files"
	@echo ""
	@echo "$(GREEN)Compound Mapping Validation:$(NC)"
	@echo " $(YELLOW)validate-semantic$(NC) - Validate mappings for semantic correctness (blocklists, units)"
	@echo " $(YELLOW)semantic-validation-summary$(NC) - Show semantic validation summary"
	@echo " $(YELLOW)validate-compound-mappings$(NC) - Validate ChEBI/PubChem IDs against official APIs"
	@echo " $(YELLOW)validate-compound-mappings-quick$(NC) - Quick validation with 50 random samples"
	@echo " $(YELLOW)remediate-compound-mappings$(NC) - Fix incorrect ChEBI IDs using PubChem lookup"
	@echo " $(YELLOW)merge-verified-mappings$(NC) - Merge verified and remediated mappings"
	@echo " $(YELLOW)validate-full-pipeline$(NC) - Run complete validation→remediation→merge workflow"
	@echo " $(YELLOW)validate-status$(NC) - Show validation report summary"
	@echo " $(YELLOW)validate-clean$(NC) - Clean validation files"
	@echo ""
	@echo "$(GREEN)Deterministic API Mapping (replaces LLM mappings):$(NC)"
	@echo " $(YELLOW)api-mapping-full-pipeline$(NC) - 🔥 Full pipeline: extract → API lookup → validate (30-60 min)"
	@echo " $(YELLOW)extract-all-compounds$(NC) - Extract all compound names from pipeline"
	@echo " $(YELLOW)generate-api-mappings$(NC) - Generate mappings via PubChem/ChEBI APIs"
	@echo " $(YELLOW)resume-api-mappings$(NC) - Resume from checkpoint (for long runs)"
	@echo " $(YELLOW)validate-api-mappings$(NC) - Show API mapping statistics"
	@echo " $(YELLOW)api-mapping-status$(NC) - Show API mapping status"
	@echo " $(YELLOW)api-mapping-clean$(NC) - Clean API mapping files"
	@echo ""
	@echo "$(GREEN)IUPAC Pipeline Steps:$(NC)"
	@echo " $(YELLOW)iupac-analyze-compounds$(NC) - Analyze existing data for download targets"
	@echo " $(YELLOW)iupac-download-data$(NC) - Download chemical data from IUPAC sources"
	@echo " $(YELLOW)iupac-process-data$(NC) - Process raw data into chemical properties"
	@echo " $(YELLOW)iupac-generate-tsv$(NC) - Generate chemical_properties.tsv file"
	@echo ""
	@echo "$(GREEN)IUPAC Utilities:$(NC)"
	@echo " $(YELLOW)iupac-status$(NC) - Show IUPAC data status and statistics"
	@echo " $(YELLOW)iupac-validate-tsv$(NC) - Validate chemical_properties.tsv format"
	@echo " $(YELLOW)iupac-clean$(NC) - Clean IUPAC data files"
	@echo " $(YELLOW)iupac-restore-backup$(NC) - Restore chemical_properties.tsv from backup"
	@echo ""
	@echo "$(GREEN)Setup Targets:$(NC)"
	@echo " $(YELLOW)install$(NC) - Install Python dependencies"
	@echo " $(YELLOW)install-dev$(NC) - Install development dependencies"
	@echo " $(YELLOW)setup-venv$(NC) - Create Python virtual environment"
	@echo ""
	@echo "$(GREEN)Quality Assurance:$(NC)"
	@echo " $(YELLOW)test$(NC) - Run all tests"
	@echo " $(YELLOW)lint$(NC) - Run code quality checks"
	@echo " $(YELLOW)format$(NC) - Format code with black and isort"
	@echo ""
	@echo "$(GREEN)Maintenance:$(NC)"
	@echo " $(YELLOW)clean$(NC) - Remove generated files and logs"
	@echo " $(YELLOW)clean-all$(NC) - Remove all generated data and outputs"
	@echo " $(YELLOW)status$(NC) - Show pipeline status and file counts"
	@echo ""
	@echo "$(GREEN)Usage Examples:$(NC)"
	@echo " make install # Install dependencies"
	@echo " make all # Run complete pipeline"
	@echo " make data-acquisition # Just download media data"
	@echo " make clean && make all # Clean rebuild"
# all: run every pipeline stage, then print a completion banner and the main
# output locations.
# NOTE(review): the stages are listed as phony prerequisites in the intended
# execution order. Under `make -j` only declared file dependencies constrain
# ordering, not the position in this list.
.PHONY: all
all: install \
     data-acquisition \
     data-conversion \
     db-mapping \
     kg-mapping-initial \
     solution-expansion \
     normalize-hydration-early \
     enhance-ingredients-early \
     kg-compound-matching \
     kg-oak-chebi-mapping \
     kg-merge-mappings \
     kg-enhance-all \
     extract-upstream-ingredients \
     map-unmapped-ingredients \
     merge-additional-mappings \
     create-hydrate-mappings \
     create-simplified-mappings \
     map-biological-ingredients-foodon \
     compute-properties \
     media-summary \
     import-mediadive-solutions \
     expand-complex-ingredients \
     analyze-unmapped-complex
	@echo "$(GREEN)════════════════════════════════════════════════════════════════$(NC)"
	@echo "$(GREEN) 🎉 COMPLETE PIPELINE FINISHED SUCCESSFULLY! 🎉 $(NC)"
	@echo "$(GREEN)════════════════════════════════════════════════════════════════$(NC)"
	@echo ""
	@echo "$(BLUE)Pipeline stages completed:$(NC)"
	@echo " ✓ Data acquisition from MediaDive/DSMZ"
	@echo " ✓ PDF/JSON conversion to structured formats"
	@echo " ✓ Chemical properties database building"
	@echo " ✓ Initial KG mapping to ChEBI"
	@echo " ✓ DSMZ solution expansion"
	@echo " ✓ OAK ChEBI ontology-based mapping"
	@echo " ✓ Unified mapping merge with confidence filtering"
	@echo " ✓ CAS-to-ChEBI upgrade (+9% coverage)"
	@echo " ✓ Formula matching for hydrates (+5% coverage)"
	@echo " ✓ Microbiology products mapping (+2% coverage)"
	@echo " ✓ Multi-ontology mapping (UBERON, FOODON, ENVO)"
	@echo " ✓ Hydrate-specific compound mappings generation"
	@echo " ✓ Biological ingredients FOODON/ENVO mapping via OAK (64% coverage, deterministic)"
	@echo " ✓ Media property calculations (pH, salinity)"
	@echo " ✓ Comprehensive media summary generation"
	@echo " ✓ MediaDive solutions import (70 trace element/vitamin solutions from kg-microbe)"
	@echo " ✓ Complex ingredients expansion (recursive: yeast extract, peptone, etc.)"
	@echo " ✓ Unmapped complex ingredients analysis (prioritization for curation)"
	@echo ""
	@echo "$(GREEN)Final ChEBI coverage: 72% (improved from 56% baseline)$(NC)"
	@echo ""
	@echo "$(BLUE)Output files:$(NC)"
	@echo " 📄 Enhanced mappings: $(HIGH_CONFIDENCE_FINAL)"
	@echo " 📄 Hydrate mappings: $(COMPOUND_MAPPINGS_STRICT_HYDRATE)"
	@echo " 📄 Simplified mappings: $(COMPOUND_MAPPINGS_SIMPLIFIED)"
	@echo " 📄 Simplified hydrate mappings: $(COMPOUND_MAPPINGS_SIMPLIFIED_HYDRATE)"
	@echo " 📄 Chemicals-only mappings: $(COMPOUND_MAPPINGS_CHEMICALS_ONLY)"
	@echo " 📄 Chemicals-only hydrate mappings: $(COMPOUND_MAPPINGS_CHEMICALS_ONLY_HYDRATE)"
	@echo " 📄 Biological FOODON mappings: $(BIOLOGICAL_INGREDIENTS_FOODON)"
	@echo " 📄 Media properties: $(MEDIA_PROPERTIES_DIR)"
	@echo " 📄 Media summary: $(MEDIA_SUMMARY)"
	@echo " 📄 Unmapped complex ingredients report: $(UNMAPPED_COMPLEX_REPORT)"
# create-output-dirs: make sure every stage directory and per-media working
# directory exists. The four mkdir calls are chained with && so a failure in
# one group stops the rest.
.PHONY: create-output-dirs
create-output-dirs:
	@mkdir -p $(DATA_ACQUISITION_DIR) $(DATA_CONVERSION_DIR) $(DB_MAPPING_DIR) $(KG_MAPPING_DIR) $(SOLUTION_EXPANSION_DIR) && \
	mkdir -p $(COMPOUND_MATCHING_DIR) $(OAK_CHEBI_DIR) $(MERGE_MAPPINGS_DIR) $(INGREDIENT_ENHANCEMENT_DIR) && \
	mkdir -p $(HYDRATE_NORMALIZATION_DIR) $(PROPERTY_CALCULATION_DIR) $(MEDIA_SUMMARY_DIR) && \
	mkdir -p $(MEDIA_PDFS_DIR) $(MEDIA_TEXTS_DIR) $(MEDIA_COMPOSITIONS_DIR) $(MEDIA_PROPERTIES_DIR)
# Pipeline stage targets
# Stage 1: Data Acquisition
# Phony entry point: creates the output tree, then requires the URL list and
# the download-complete marker.
.PHONY: data-acquisition
data-acquisition: create-output-dirs $(GROWTH_MEDIA_URLS) $(MEDIA_PDFS_DIR)/.done
	@echo "$(GREEN)✓ Data acquisition completed$(NC)"

# URL list. The script is run without arguments.
# NOTE(review): presumably parse_media_urls.py writes $(GROWTH_MEDIA_URLS) by
# default - confirm, since the path is not passed on the command line.
$(GROWTH_MEDIA_URLS):
	@echo "$(BLUE)Parsing media URLs from source files...$(NC)"
	$(PYTHON) $(SCRIPTS_DIR)/parse_media_urls.py

# Download step. The .done marker is touched only after the download script
# exits successfully, so an interrupted download is retried on the next run.
$(MEDIA_PDFS_DIR)/.done: $(GROWTH_MEDIA_URLS)
	@echo "$(BLUE)Downloading media PDFs and JSON data...$(NC)"
	$(PYTHON) $(SCRIPTS_DIR)/download_media_pdfs.py
	@mkdir -p $(@D) && touch $@
# Stage 2: Data Conversion
# Phony entry point: requires both conversion markers below.
.PHONY: data-conversion
data-conversion: $(MEDIA_TEXTS_DIR)/.done $(MEDIA_COMPOSITIONS_DIR)/.done
	@echo "$(GREEN)✓ Data conversion completed$(NC)"

# PDF -> text conversion, gated on the download marker; touches its own marker
# once the converter has exited successfully.
$(MEDIA_TEXTS_DIR)/.done: $(MEDIA_PDFS_DIR)/.done
	@echo "$(BLUE)Converting PDFs to text format...$(NC)"
	$(PYTHON) $(SCRIPTS_DIR)/convert_pdfs_to_text.py
	@mkdir -p $(@D) && touch $@

# Composition extraction (DSMZ JSON, JCM HTML and PDFs, per the messages below).
# NOTE(review): --input-dir is the literal "media_pdfs", not $(MEDIA_PDFS_DIR);
# confirm that this is where the download script actually writes.
$(MEDIA_COMPOSITIONS_DIR)/.done: $(MEDIA_TEXTS_DIR)/.done
	@echo "$(BLUE)Extracting ALL chemical compositions using enhanced multi-format approach...$(NC)"
	@echo "$(YELLOW)Goal: Extract from DSMZ JSON + JCM HTML + PDFs using specialized parsers$(NC)"
	@echo "$(YELLOW)✨ NEW: JCM HTML parsing added for 1,313+ additional media$(NC)"
	$(PYTHON) $(SCRIPTS_DIR)/extract_all_compositions_enhanced.py \
		--input-dir media_pdfs \
		--output-dir $(@D)
	@mkdir -p $(@D) && touch $@
# Stage 3: DB Mapping - build the chemical properties table (ingredient → pKa, properties)
# Both phony names are aliases for the same file target.
.PHONY: db-mapping chemical-databases
db-mapping chemical-databases: $(CHEMICAL_PROPERTIES)
	@echo "$(GREEN)✓ DB mapping completed: IUPAC/PubChem downloaded, ingredient → chemical properties$(NC)"

# Runs the PubChem pipeline module over the high-confidence mapping file.
# The compound count shown is the number of distinct values in column 2 of that
# file, header row excluded.
# A non-zero exit from the PubChem pipeline is reported as "completed with
# warnings" and does not fail the rule.
# NOTE(review): PUBCHEM_DATA_DIR is not defined in the visible part of the file.
$(CHEMICAL_PROPERTIES): $(HIGH_CONFIDENCE_MAPPINGS)
	@echo "$(BLUE)DB Mapping: Building ingredient → chemical properties database...$(NC)"
	@echo "$(YELLOW)Goal: Maximize ingredients with pKa and molecular properties$(NC)"
	@n_compounds=$$(tail -n +2 $< | cut -f2 | sort -u | wc -l | tr -d ' '); \
	echo "$(YELLOW)Found $$n_compounds unique compounds from high-confidence mappings$(NC)"
	@echo "$(YELLOW)Phase 1: DOWNLOADING PubChem chemical data for all compounds...$(NC)"
	@echo "$(YELLOW)This may take 15-30 minutes depending on network speed and API rate limits$(NC)"
	$(PYTHON) -m src.chem.pubchem.pipeline \
		--from-mapping-file $< \
		--data-dir $(PUBCHEM_DATA_DIR) \
		--output-file $@ \
		|| echo "$(YELLOW)PubChem download/processing completed with warnings$(NC)"
	@echo "$(GREEN)✓ DB mapping database ready: $@$(NC)"
# Stage 4: Initial KG Mapping - Knowledge Graph Mapping (ingredient → ChEBI/KEGG/PubChem IDs)
# kg-mapping-initial, kg-mapping and mapping are aliases; map-compositions-to-kg
# requires the same file without first creating the output directories.
.PHONY: kg-mapping-initial kg-mapping mapping map-compositions-to-kg
kg-mapping-initial kg-mapping mapping: create-output-dirs $(COMPOSITION_MAPPING)
	@echo "$(GREEN)✓ Initial KG mapping completed: ingredient → knowledge graph IDs$(NC)"

map-compositions-to-kg: $(COMPOSITION_MAPPING)
	@echo "$(GREEN)✓ Composition to KG mapping completed$(NC)"

# The mapping script receives only the output directory (the directory part of
# the target); the file name inside it is chosen by the script.
$(COMPOSITION_MAPPING): $(MEDIA_COMPOSITIONS_DIR)/.done
	@echo "$(BLUE)KG Mapping: ingredient → ChEBI/KEGG/PubChem IDs...$(NC)"
	@echo "$(YELLOW)Goal: Maximize ingredients mapped to ChEBI$(NC)"
	$(PYTHON) $(SCRIPTS_DIR)/map_compositions_to_kg.py --output-dir $(@D)
# Stage 5: Solution Expansion - Expand DSMZ solution: references to individual chemical components
.PHONY: solution-expansion
solution-expansion: $(EXPANDED_MAPPING)
	@echo "$(GREEN)✓ Solution expansion completed: DSMZ solution: references expanded to chemical components$(NC)"

# Complete DSMZ solution expansion workflow.
# The tool is run from inside $(SOLUTION_EXPANSION_DIR); the "../../" prefixes
# assume that directory sits exactly two levels below the repository root,
# which holds for the default OUTPUT_DIR.
# The directory is an order-only prerequisite: it must exist, but its
# timestamp never triggers a rebuild.
#
# The expansion report is moved only when SOLUTION_EXPANSION_REPORT has been
# pointed somewhere other than the location named in the move. With the
# definitions in this file the two paths are identical, and an unconditional
# `mv X X` is rejected by GNU mv ("are the same file"), which failed this
# recipe after the expansion itself had succeeded.
$(EXPANDED_MAPPING): $(COMPOSITION_MAPPING) | $(SOLUTION_EXPANSION_DIR)
	@echo "$(BLUE)Solution Expansion: Expanding DSMZ solution: references...$(NC)"
	@echo "$(YELLOW)Goal: Convert solution:241 → individual chemical components from DSMZ PDFs$(NC)"
	cd $(SOLUTION_EXPANSION_DIR) && \
	$(PYTHON) ../../src/tools/complete_solution_expansion.py \
		--input ../../$(COMPOSITION_MAPPING) \
		--output $(@F)
	@report_src="$(SOLUTION_EXPANSION_DIR)/dsmz_solution_expansion_report.json"; \
	if [ "$$report_src" != "$(SOLUTION_EXPANSION_REPORT)" ]; then \
		mv "$$report_src" "$(SOLUTION_EXPANSION_REPORT)"; \
	fi

# Create solution expansion output directory
$(SOLUTION_EXPANSION_DIR):
	@mkdir -p $@
# Stage 6: EARLY Hydration Normalization - Fix hydrate inconsistencies BEFORE advanced matching
.PHONY: normalize-hydration-early
normalize-hydration-early: $(KG_MAPPING_DIR)/composition_kg_mapping_hydrate_normalized.tsv
	@echo "$(GREEN)✓ EARLY hydration normalization completed: consistent base compounds for all downstream steps$(NC)"

# Runs the enhanced hydrate normaliser on the solution-expanded mapping.
# NOTE(review): the script is given only the input path and an output suffix,
# never the target path. Confirm that it really writes
# $(KG_MAPPING_DIR)/composition_kg_mapping_hydrate_normalized.tsv; the Stage 7
# rule, which uses the same option style, has to rename its script's output.
$(KG_MAPPING_DIR)/composition_kg_mapping_hydrate_normalized.tsv: $(EXPANDED_MAPPING)
	@echo "$(BLUE)🔥 EARLY Hydration Normalization: Fixing hydrate inconsistencies BEFORE advanced matching...$(NC)"
	@echo "$(YELLOW)CRITICAL: This normalizes CaCl2 x 2 H2O & CaCl2 x 6 H2O → same base ChEBI but correct MW$(NC)"
	$(PYTHON) src/hydration/normalize_hydration_enhanced.py \
		--input-high $< \
		--output-suffix _hydrate_normalized
# Stage 7: EARLY Ingredient Enhancement - Convert ingredient: codes AFTER hydrate normalization
.PHONY: enhance-ingredients-early
enhance-ingredients-early: $(KG_MAPPING_DIR)/composition_kg_mapping_ingredient_enhanced.tsv
	@echo "$(GREEN)✓ EARLY ingredient enhancement completed: ingredient: codes → ChEBI IDs with normalized compounds$(NC)"

# Runs ingredient enhancement on the hydrate-normalised mapping. The file named
# <input stem>_ingredient_enhanced.tsv is then renamed to the shorter target
# name used by the later stages.
$(KG_MAPPING_DIR)/composition_kg_mapping_ingredient_enhanced.tsv: $(KG_MAPPING_DIR)/composition_kg_mapping_hydrate_normalized.tsv
	@echo "$(BLUE)🔥 EARLY Ingredient Enhancement: Converting ingredient: codes using normalized compounds...$(NC)"
	@echo "$(YELLOW)ADVANTAGE: Works with hydrate-corrected base compounds for better ChEBI matching$(NC)"
	$(PYTHON) src/mapping/enhance_ingredient_matching.py \
		--input-high $< \
		--output-suffix _ingredient_enhanced
	@mv $(<:.tsv=_ingredient_enhanced.tsv) $@
# Stage 8: Enhanced KG Compound Matching - Uses normalized base compounds for better matching
# Both phony names are aliases for the matches file.
.PHONY: kg-compound-matching compound-matching
kg-compound-matching compound-matching: $(UNACCOUNTED_MATCHES)
	@echo "$(GREEN)✓ Enhanced KG compound matching completed: additional ChEBI matches using normalized compounds$(NC)"

# Searches for ChEBI matches and writes them to the target file.
# NOTE(review): the ingredient-enhanced mapping is declared as the
# prerequisite, but only --output is passed to the script. Confirm the script
# reads that file by default.
$(UNACCOUNTED_MATCHES): $(KG_MAPPING_DIR)/composition_kg_mapping_ingredient_enhanced.tsv
	@echo "$(BLUE)Enhanced KG Compound Matching: Finding ChEBI matches using normalized/enhanced compounds...$(NC)"
	@echo "$(YELLOW)ADVANTAGE: Uses hydrate-normalized + ingredient-enhanced compounds for better matching$(NC)"
	@echo "$(YELLOW)Note: Using enhanced composition mapping as input for better compound coverage$(NC)"
	$(PYTHON) $(SCRIPTS_DIR)/find_unaccounted_compound_matches.py --output $@
# Stage 9: Enhanced KG OAK CHEBI Mapping - Advanced ChEBI mapping with normalized compounds
# Both phony names are aliases for the updated composition mapping file.
# NOTE(review): UPDATED_COMPOSITION_MAPPING, COMPOUNDS_FOR_CHEBI,
# CHEBI_LEXICAL_INDEX and OAK_CHEBI_ANNOTATIONS are not defined in the visible
# part of this file. UPDATED_COMPOSITION_MAPPING appears in rule headers, which
# make expands while reading the file, so it must already be defined above this
# point - confirm.
.PHONY: kg-oak-chebi-mapping oak-chebi-mapping
kg-oak-chebi-mapping oak-chebi-mapping: $(UPDATED_COMPOSITION_MAPPING)
@echo "$(GREEN)✓ Enhanced KG OAK CHEBI mapping completed: ontology annotations using normalized compounds$(NC)"
# Enhanced KG mapping with OAK CHEBI annotations using normalized/enhanced compounds (maximize ChEBI coverage)
# Recipe outline:
#   1. Run the non-ChEBI compound extraction; a failure is tolerated and only
#      reported ("Using existing compound list").
#   2. If $(COMPOUNDS_FOR_CHEBI) exists and is non-empty: run `runoak annotate`
#      against sqlite:obo:chebi (failure tolerated), apply the annotations to
#      produce the target, then run the hydrated-mapping fixer (failure
#      tolerated). If applying the annotations fails, the target is instead
#      copied from $(COMPOSITION_MAPPING).
#   3. Otherwise the target is copied from $(COMPOSITION_MAPPING).
# NOTE(review): both fallbacks copy the initial Stage 4 mapping, not the
# ingredient-enhanced file listed as a prerequisite - confirm this is intended.
$(UPDATED_COMPOSITION_MAPPING): $(UNACCOUNTED_MATCHES) $(KG_MAPPING_DIR)/composition_kg_mapping_ingredient_enhanced.tsv
@echo "$(BLUE)KG OAK CHEBI Mapping: ingredient → ChEBI with ontology annotations...$(NC)"
@echo "$(YELLOW)Goal: Maximize ChEBI coverage using ontology-based matching$(NC)"
@echo "$(YELLOW)Extracting ingredients needing CHEBI mapping...$(NC)"
$(PYTHON) src/analysis/extract_non_chebi_compounds.py || echo "$(YELLOW)Using existing compound list$(NC)"
@if [ -f "$(COMPOUNDS_FOR_CHEBI)" ] && [ -s "$(COMPOUNDS_FOR_CHEBI)" ]; then \
echo "$(YELLOW)Running OAK CHEBI annotation on $$(wc -l < $(COMPOUNDS_FOR_CHEBI)) ingredients...$(NC)"; \
echo "$(YELLOW)This may take 5-10 minutes to build the CHEBI lexical index...$(NC)"; \
runoak -i sqlite:obo:chebi annotate --text-file $(COMPOUNDS_FOR_CHEBI) --output-type json --lexical-index-file $(CHEBI_LEXICAL_INDEX) --output $(OAK_CHEBI_ANNOTATIONS) || echo "$(YELLOW)OAK annotation completed with warnings$(NC)"; \
echo "$(YELLOW)Applying OAK CHEBI mappings...$(NC)"; \
$(PYTHON) src/mapping/apply_oak_chebi_mappings.py --annotations-file $(OAK_CHEBI_ANNOTATIONS) --compounds-file $(COMPOUNDS_FOR_CHEBI) --output-file $(UPDATED_COMPOSITION_MAPPING) || cp $(COMPOSITION_MAPPING) $(UPDATED_COMPOSITION_MAPPING); \
echo "$(YELLOW)Fixing hydrated ingredient mappings...$(NC)"; \
$(PYTHON) src/hydration/fix_hydrated_compound_mappings.py || echo "$(YELLOW)Hydrated compound fixing completed with warnings$(NC)"; \
else \
echo "$(YELLOW)No ingredients need CHEBI mapping, using original composition mapping$(NC)"; \
cp $(COMPOSITION_MAPPING) $(UPDATED_COMPOSITION_MAPPING); \
fi
# Stage 10: Enhanced KG Merge Mappings - Consolidate all mapping sources with normalized compounds
.PHONY: kg-merge-mappings merge-mappings
kg-merge-mappings merge-mappings: $(UNIFIED_MAPPINGS) $(HIGH_CONFIDENCE_MAPPINGS) $(LOW_CONFIDENCE_MAPPINGS)
	@echo "$(GREEN)✓ Enhanced KG mapping merge completed: unified ingredient → ChEBI mappings with consistent hydration$(NC)"

# Create unified KG mapping from the updated composition mapping + ChEBI matches.
$(UNIFIED_MAPPINGS): $(UPDATED_COMPOSITION_MAPPING) $(UNACCOUNTED_MATCHES)
	@echo "$(BLUE)KG Merge Mappings: Consolidating ingredient → ChEBI mappings...$(NC)"
	@echo "$(YELLOW)Goal: Create unified high-quality ChEBI mappings$(NC)"
	$(PYTHON) $(SCRIPTS_DIR)/merge_compound_mappings.py \
		--composition-file $(UPDATED_COMPOSITION_MAPPING) \
		--matches-file $(UNACCOUNTED_MATCHES) \
		--output $@

# Filter KG mappings by confidence level (high/low confidence ChEBI mappings).
#
# One run of the filter script writes BOTH output files. A header of the form
# "HIGH LOW: prereq" declares two independent rules that share a recipe, so
# under `make -j` the filter could be started twice at the same time, both
# runs writing the same files. The filter is therefore attached to a single
# marker file, and the two real outputs depend on that marker with an empty
# recipe.
# To force the filter to run again (for example after deleting one of the
# outputs by hand), delete the marker file.
confidence_filter_done := $(MERGE_MAPPINGS_DIR)/.confidence_filter.done

$(confidence_filter_done): $(UNIFIED_MAPPINGS)
	@echo "$(BLUE)Filtering KG mappings by confidence level...$(NC)"
	@mkdir -p $(@D)
	$(PYTHON) src/mapping/filter_high_confidence_mappings.py \
		--input $(UNIFIED_MAPPINGS) \
		--output $(HIGH_CONFIDENCE_MAPPINGS) \
		--low-confidence-output $(LOW_CONFIDENCE_MAPPINGS)
	@touch $@

$(HIGH_CONFIDENCE_MAPPINGS) $(LOW_CONFIDENCE_MAPPINGS): $(confidence_filter_done) ;
# ============================================================================
# Stage 10.5: Mapping Enhancements (CAS→ChEBI, Formula, Microbio)
# ============================================================================
# Stage 10.5a: CAS-to-ChEBI Upgrade
# Input: the high-confidence mappings. Output: the "_upgraded" file.
# The ChEBI node table is passed as a lookup file but is not a prerequisite.
.PHONY: kg-enhance-cas-upgrade
kg-enhance-cas-upgrade: $(HIGH_CONFIDENCE_UPGRADED)
	@echo "$(GREEN)✓ CAS-to-ChEBI upgrade completed$(NC)"

$(HIGH_CONFIDENCE_UPGRADED): $(HIGH_CONFIDENCE_MAPPINGS)
	@echo "$(BLUE)Enhancing mappings: Upgrading CAS-RN → ChEBI...$(NC)"
	@echo "$(YELLOW)Goal: Convert CAS Registry Numbers to ChEBI IDs for better semantic integration$(NC)"
	$(PYTHON) src/mapping/cas_to_chebi_upgrader.py --chebi-file $(CHEBI_NODES_FILE) \
		--input $< --output $@
# Stage 10.5b: Formula Matching
# Phony wrapper around the formula-matched mapping file.
.PHONY: kg-enhance-formula-matching
kg-enhance-formula-matching: $(HIGH_CONFIDENCE_FORMULA)
	@echo "$(GREEN)✓ Formula matching completed$(NC)"

# Feeds the CAS-upgraded mappings ($<) through apply_formula_matching.py with
# the ChEBI nodes file; result goes to $@.
$(HIGH_CONFIDENCE_FORMULA): $(HIGH_CONFIDENCE_UPGRADED)
	@echo "$(BLUE)Enhancing mappings: Matching hydrated chemical formulas...$(NC)"
	@echo "$(YELLOW)Goal: Map hydrated compounds (e.g., 'CoCl2 x 6 H2O') to ChEBI$(NC)"
	$(PYTHON) src/mapping/apply_formula_matching.py \
		--chebi-file $(CHEBI_NODES_FILE) \
		--input $< \
		--output $@
# Stage 10.5c: Microbiology Products Mapping
# Phony wrapper around the microbiology-products mapping file.
.PHONY: kg-enhance-microbio-products
kg-enhance-microbio-products: $(HIGH_CONFIDENCE_FINAL)
	@echo "$(GREEN)✓ Microbiology products mapping completed$(NC)"

# apply_microbio_products.py reads the formula-matched mappings ($<) and
# writes $@.
$(HIGH_CONFIDENCE_FINAL): $(HIGH_CONFIDENCE_FORMULA)
	@echo "$(BLUE)Enhancing mappings: Applying microbiology products dictionary...$(NC)"
	@echo "$(YELLOW)Goal: Map biological products (peptones, extracts) to ChEBI/UBERON$(NC)"
	$(PYTHON) src/mapping/apply_microbio_products.py \
		--input $< \
		--output $@
# Stage 10.5c2: Apply Curated Dictionary Upgrades
# Upgrades "ingredient:" IDs to ontology IDs via the curated
# BIOLOGICAL_PRODUCTS dictionary (per the original stage description).
.PHONY: kg-enhance-curated-upgrades
kg-enhance-curated-upgrades: $(HIGH_CONFIDENCE_CURATED)
	@echo "$(GREEN)✓ Curated dictionary upgrades completed$(NC)"

# Runs the src.mapping.apply_curated_upgrades module: $< in, $@ out.
$(HIGH_CONFIDENCE_CURATED): $(HIGH_CONFIDENCE_FINAL)
	@echo "$(BLUE)Enhancing mappings: Applying curated dictionary upgrades...$(NC)"
	@echo "$(YELLOW)Goal: Upgrade ingredient: IDs to ChEBI/FOODON/UBERON using curated dictionary$(NC)"
	$(PYTHON) -m src.mapping.apply_curated_upgrades \
		--input $< \
		--output $@
# Stage 10.5d: Enrich with ChEBI Labels and Formulas
# ChEBI formulas table consumed by this stage (and by later stages).
CHEBI_FORMULAS_FILE := data/curated/chebi_formulas.tsv

# Phony wrapper around the enriched mapping file.
.PHONY: kg-enrich-chebi
kg-enrich-chebi: $(HIGH_CONFIDENCE_ENRICHED)
	@echo "$(GREEN)✓ ChEBI enrichment completed (labels + formulas)$(NC)"

# Rebuilt whenever the curated mappings ($<), the ChEBI nodes file, or the
# ChEBI formulas file changes.
$(HIGH_CONFIDENCE_ENRICHED): $(HIGH_CONFIDENCE_CURATED) $(CHEBI_NODES_FILE) $(CHEBI_FORMULAS_FILE)
	@echo "$(BLUE)Enriching mappings: Adding ChEBI labels and molecular formulas...$(NC)"
	@echo "$(YELLOW)Goal: Add chebi_label and chebi_formula columns for all CHEBI mappings$(NC)"
	$(PYTHON) -m src.mapping.enrich_with_chebi_data \
		--input $< \
		--chebi-nodes $(CHEBI_NODES_FILE) \
		--chebi-formulas $(CHEBI_FORMULAS_FILE) \
		--output $@
# Stage 10.5e: Create Compound Name Lookup Table
# Many-to-1 table: every observed name (hydrate variations included) points to
# one ChEBI ID.
COMPOUND_LOOKUP_TABLE := $(MERGE_MAPPINGS_DIR)/compound_name_lookup.tsv

.PHONY: kg-create-lookup-table
kg-create-lookup-table: $(COMPOUND_LOOKUP_TABLE)
	@echo "$(GREEN)✓ Compound lookup table created$(NC)"

# Built by src.mapping.create_compound_lookup_table from the enriched mappings
# ($<) and the ChEBI formulas file.
$(COMPOUND_LOOKUP_TABLE): $(HIGH_CONFIDENCE_ENRICHED) $(CHEBI_FORMULAS_FILE)
	@echo "$(BLUE)Creating compound name lookup table...$(NC)"
	@echo "$(YELLOW)Goal: Many-to-1 mapping with each observed name → parent compound$(NC)"
	@echo "$(YELLOW)All hydrate forms map to same anhydrous parent ChEBI ID$(NC)"
	$(PYTHON) -m src.mapping.create_compound_lookup_table \
		--input $< \
		--chebi-formulas $(CHEBI_FORMULAS_FILE) \
		--output $@
# Stage 10.5: Complete Enhancement Pipeline
# Phony entry points (two names) that require the compound lookup table and
# then report how many distinct first-column names carry a CHEBI: or UBERON:
# ID in the second column, plus the overall distinct-name count.
# NOTE(review): the "56% → 72%" line is a fixed string, not computed here.
.PHONY: kg-enhance-all enhance-mappings
kg-enhance-all enhance-mappings: $(COMPOUND_LOOKUP_TABLE)
	@echo "$(GREEN)✓ All mapping enhancements completed (including ChEBI labels/formulas + lookup table)$(NC)"
	@echo "$(GREEN)Coverage improved from 56% → 72% (+16%)$(NC)"
	@ENHANCED_CHEBI=$$(awk -F'\t' 'NR>1 && $$2 ~ /^CHEBI:/ {print $$1}' $< 2>/dev/null | sort -u | wc -l | tr -d ' '); \
	ENHANCED_UBERON=$$(awk -F'\t' 'NR>1 && $$2 ~ /^UBERON:/ {print $$1}' $< 2>/dev/null | sort -u | wc -l | tr -d ' '); \
	TOTAL_UNIQUE=$$(awk -F'\t' 'NR>1 {print $$1}' $< 2>/dev/null | sort -u | wc -l | tr -d ' '); \
	echo "$(GREEN)ChEBI: $$ENHANCED_CHEBI unique compounds, UBERON: $$ENHANCED_UBERON, Total: $$TOTAL_UNIQUE$(NC)"
# Stage 10.6: Finalize Mapping Files
# Creates clean final output files with simplified names and extracts unmapped compounds
.PHONY: finalize-mappings
finalize-mappings: $(COMPOUND_MAPPINGS) $(UNMAPPED_COMPOUNDS)
	@echo "$(GREEN)✓ Final mapping files created$(NC)"

# Copies the enriched high-confidence file to $(COMPOUND_MAPPINGS) and the
# low-confidence file to $(COMPOUND_MAPPINGS_LOW).
# $(LOW_CONFIDENCE_MAPPINGS) is declared as a prerequisite because the recipe
# reads it: without the declaration make neither builds it first nor refreshes
# the copy when it changes, and `cp` fails if it is absent.
# NOTE(review): $(COMPOUND_MAPPINGS_LOW) is a side output of this recipe and is
# not tracked as a target of its own in the visible part of the file.
$(COMPOUND_MAPPINGS): $(HIGH_CONFIDENCE_ENRICHED) $(LOW_CONFIDENCE_MAPPINGS)
	@echo "$(BLUE)Creating final compound_mappings.tsv...$(NC)"
	@cp $(HIGH_CONFIDENCE_ENRICHED) $(COMPOUND_MAPPINGS)
	@cp $(LOW_CONFIDENCE_MAPPINGS) $(COMPOUND_MAPPINGS_LOW)
	@echo "$(GREEN)✓ Created $(COMPOUND_MAPPINGS)$(NC)"
	@echo "$(GREEN)✓ Created $(COMPOUND_MAPPINGS_LOW)$(NC)"
# Unmapped compounds: header row of the final mappings ($<) followed by every
# data row whose third column starts with "ingredient:".
$(UNMAPPED_COMPOUNDS): $(COMPOUND_MAPPINGS)
	@echo "$(BLUE)Extracting unmapped compounds...$(NC)"
	@head -1 $< > $@ && \
		awk -F'\t' 'NR>1 && $$3 ~ /^ingredient:/' $< >> $@
	@UNMAPPED=$$(tail -n +2 $@ | wc -l | tr -d ' '); \
	echo "$(YELLOW)Unmapped compounds: $$UNMAPPED$(NC)"
	@echo "$(GREEN)✓ Created $@$(NC)"
# ============================================================================
# Stage 10.6: Semantic Validation
# Validates mappings for semantic correctness (blocklisted ChEBI IDs,
# unit parsing errors, phosphate confusion, label mismatches)
# ============================================================================
VALIDATION_QUALITY_DIR := $(OUTPUT_DIR)/quality
SEMANTIC_VALIDATION_REPORT := $(VALIDATION_QUALITY_DIR)/mapping_validation_report.tsv

.PHONY: validate-semantic
validate-semantic: $(SEMANTIC_VALIDATION_REPORT)
	@echo "$(GREEN)✓ Semantic validation completed$(NC)"

# Runs src.quality.validate_mappings on the final compound mappings and prints
# how many report lines contain "critical" / "warning".
# The counts use `grep -c ... || true`: `grep -c` already prints 0 when nothing
# matches but exits 1, so the previous `|| echo "0"` fallback produced a
# two-line "0\n0" value.
$(SEMANTIC_VALIDATION_REPORT): $(COMPOUND_MAPPINGS) | create-output-dirs
	@mkdir -p $(VALIDATION_QUALITY_DIR)
	@echo "$(BLUE)Running semantic validation of compound mappings...$(NC)"
	$(PYTHON) -m src.quality.validate_mappings \
		--input $(COMPOUND_MAPPINGS) \
		--output $(SEMANTIC_VALIDATION_REPORT)
	@echo ""
	@echo "$(YELLOW)Critical issues to fix:$(NC)"
	@if [ -f $(SEMANTIC_VALIDATION_REPORT) ]; then \
		CRITICAL=$$(grep -c "critical" $(SEMANTIC_VALIDATION_REPORT) 2>/dev/null || true); \
		WARNINGS=$$(grep -c "warning" $(SEMANTIC_VALIDATION_REPORT) 2>/dev/null || true); \
		echo " Critical: $${CRITICAL:-0}"; \
		echo " Warnings: $${WARNINGS:-0}"; \
	fi
	@echo "$(GREEN)Report: $(SEMANTIC_VALIDATION_REPORT)$(NC)"
# Prints a summary of an existing semantic validation report: a frequency table
# of column 5 values (header row skipped) and up to ten distinct column 2
# values taken from lines containing "critical". Does not build the report;
# if it is missing, a hint to run `make validate-semantic` is shown instead.
.PHONY: semantic-validation-summary
semantic-validation-summary:
	@echo "$(CYAN)=== Semantic Validation Summary ===$(NC)"
	@if [ ! -f $(SEMANTIC_VALIDATION_REPORT) ]; then \
		echo "$(RED)No validation report found. Run: make validate-semantic$(NC)"; \
	else \
		echo ""; \
		echo "$(YELLOW)Issues by type:$(NC)"; \
		cut -f5 $(SEMANTIC_VALIDATION_REPORT) | tail -n +2 | sort | uniq -c | sort -rn; \
		echo ""; \
		echo "$(YELLOW)Unique compounds with critical issues:$(NC)"; \
		grep "critical" $(SEMANTIC_VALIDATION_REPORT) | cut -f2 | sort -u | head -10; \
	fi
# Stage 10.7: Strict mapping file
# Three chained steps (validation filter, CAS upgrade, PubChem lookup) end in
# compound_mappings_strict.tsv; the two files before it are intermediates.
COMPOUND_MAPPINGS_STRICT_FILTERED := $(MERGE_MAPPINGS_DIR)/compound_mappings_strict_filtered.tsv
COMPOUND_MAPPINGS_STRICT_CAS      := $(MERGE_MAPPINGS_DIR)/compound_mappings_strict_cas_upgraded.tsv
COMPOUND_MAPPINGS_STRICT          := $(MERGE_MAPPINGS_DIR)/compound_mappings_strict.tsv
# Cache file handed to the PubChem lookup step via --cache.
PUBCHEM_CACHE                     := data/cache/pubchem_name_cache.tsv

# Phony entry point for the whole chain.
.PHONY: apply-validation-filter
apply-validation-filter: $(COMPOUND_MAPPINGS_STRICT)
	@echo "$(GREEN)✓ Strict mappings complete (validation + CAS upgrade + PubChem)$(NC)"
# Step 1: drop bad mappings.
# src.quality.apply_validation_filter receives the compound mappings ($<) and
# the semantic validation report, and writes the filtered file ($@).
$(COMPOUND_MAPPINGS_STRICT_FILTERED): $(COMPOUND_MAPPINGS) $(SEMANTIC_VALIDATION_REPORT)
	@echo "$(BLUE)Step 1: Filtering out bad mappings...$(NC)"
	$(PYTHON) -m src.quality.apply_validation_filter \
		--mappings $< \
		--validation $(SEMANTIC_VALIDATION_REPORT) \
		--output $@
# Step 2: upgrade remaining CAS-RN IDs to ChEBI where possible.
# Same upgrader script as Stage 10.5a, applied to the filtered file ($<).
$(COMPOUND_MAPPINGS_STRICT_CAS): $(COMPOUND_MAPPINGS_STRICT_FILTERED)
	@echo "$(BLUE)Step 2: Upgrading CAS-RN → ChEBI...$(NC)"
	$(PYTHON) src/mapping/cas_to_chebi_upgrader.py \
		--chebi-file $(CHEBI_NODES_FILE) \
		--input $< \
		--output $@
# Step 3: PubChem lookup for remaining unmapped compounds (final step → compound_mappings_strict.tsv)
# After the lookup, prints how many third-column values start with each ID
# prefix (CHEBI:, PubChem:, CAS-RN:, ingredient:).
# Counts use `grep -c ... || true`: `grep -c` prints 0 itself when nothing
# matches (and exits 1), so the former `|| echo "0"` yielded "0\n0".
$(COMPOUND_MAPPINGS_STRICT): $(COMPOUND_MAPPINGS_STRICT_CAS)
	@echo "$(BLUE)Step 3: Looking up remaining compounds in PubChem...$(NC)"
	@echo "$(YELLOW)This may take a while for uncached compounds$(NC)"
	$(PYTHON) -m src.mapping.pubchem_lookup \
		--input $(COMPOUND_MAPPINGS_STRICT_CAS) \
		--output $(COMPOUND_MAPPINGS_STRICT) \
		--cache $(PUBCHEM_CACHE)
	@echo ""
	@echo "$(YELLOW)Final strict mapping summary:$(NC)"
	@CHEBI=$$(cut -f3 $(COMPOUND_MAPPINGS_STRICT) | grep -c "^CHEBI:" || true); \
	PUBCHEM=$$(cut -f3 $(COMPOUND_MAPPINGS_STRICT) | grep -c "^PubChem:" || true); \
	CAS=$$(cut -f3 $(COMPOUND_MAPPINGS_STRICT) | grep -c "^CAS-RN:" || true); \
	INGRED=$$(cut -f3 $(COMPOUND_MAPPINGS_STRICT) | grep -c "^ingredient:" || true); \
	echo " ChEBI: $$CHEBI"; \
	echo " PubChem: $$PUBCHEM"; \
	echo " CAS-RN: $$CAS"; \
	echo " ingredient: $$INGRED"
# Backwards-compatible alias: `pubchem-lookup` has no recipe of its own and
# simply requires apply-validation-filter.
pubchem-lookup: apply-validation-filter
.PHONY: pubchem-lookup
# After finalize-mappings has run, relocate the intermediate high-confidence
# files into $(MERGE_MAPPINGS_DIR)/attic. Files that do not exist are skipped
# and a failed move is ignored, so this target never fails on a missing file.
.PHONY: cleanup-intermediates
cleanup-intermediates: finalize-mappings
	@echo "$(BLUE)Moving intermediate files to attic...$(NC)"
	@mkdir -p $(MERGE_MAPPINGS_DIR)/attic
	@for f in \
		$(HIGH_CONFIDENCE_MAPPINGS) \
		$(HIGH_CONFIDENCE_UPGRADED) \
		$(HIGH_CONFIDENCE_FORMULA) \
		$(HIGH_CONFIDENCE_FINAL) \
		$(HIGH_CONFIDENCE_CURATED) \
		$(HIGH_CONFIDENCE_ENRICHED); \
	do \
		if [ -f "$$f" ]; then \
			mv "$$f" $(MERGE_MAPPINGS_DIR)/attic/ 2>/dev/null || true; \
		fi; \
	done
	@echo "$(GREEN)✓ Intermediate files moved to attic$(NC)"
# ============================================================================
# Stage 10.5.5: Extract Upstream Ingredient Nodes
# Extracts mediadive.ingredient nodes from KG-Microbe's transformed nodes.tsv
# ============================================================================
# Upstream KG-Microbe mediadive nodes file.
# The default is a machine-specific absolute path; `?=` lets other checkouts
# supply their own location through the environment as well as on the command
# line (e.g. `make UPSTREAM_NODES=/path/to/nodes.tsv extract-upstream-ingredients`).
UPSTREAM_NODES ?= /Users/marcin/Documents/VIMSS/ontology/KG-Hub/KG-Microbe/kg-microbe/data/transformed/mediadive/nodes.tsv
UPSTREAM_INGREDIENTS := $(MERGE_MAPPINGS_DIR)/upstream_mediadive_ingredients.tsv
UPSTREAM_INGREDIENTS_ENHANCED := $(MERGE_MAPPINGS_DIR)/upstream_ingredients_formula_enhanced.tsv
UPSTREAM_INGREDIENTS_HYDRATE_ENHANCED := $(MERGE_MAPPINGS_DIR)/upstream_ingredients_hydrate_enhanced.tsv

# Extract mediadive.ingredient nodes from upstream KG
.PHONY: extract-upstream-ingredients
extract-upstream-ingredients: $(UPSTREAM_INGREDIENTS)
	@echo "$(GREEN)✓ Upstream ingredient extraction completed$(NC)"

# Keeps columns 1 and 3 of every line whose ID starts with the literal prefix
# "mediadive.ingredient:". The dot is escaped so it no longer matches an
# arbitrary character.
$(UPSTREAM_INGREDIENTS): $(UPSTREAM_NODES) | create-output-dirs
	@echo "$(BLUE)Extracting mediadive.ingredient nodes from upstream KG...$(NC)"
	@echo "$(YELLOW)Source: $(UPSTREAM_NODES)$(NC)"
	@grep "^mediadive\.ingredient:" $(UPSTREAM_NODES) | cut -f1,3 > $(UPSTREAM_INGREDIENTS)
	@echo "$(GREEN)Extracted $$(wc -l < $(UPSTREAM_INGREDIENTS) | tr -d ' ') ingredient nodes$(NC)"
# ============================================================================
# Stage 10.5c: Enhance Upstream Ingredients (PubChem/OLS + ChEBI Formula)
#   1. PubChem/OLS multi-ontology lookup for biological + chemical materials
#   2. ChEBI formula matching on what is still unmapped (especially hydrates)
# ============================================================================
# API-result caches shared by several stages.
CACHE_DIR          := data/cache
OLS_CACHE_FILE     := $(CACHE_DIR)/ols_multi_ontology_cache.tsv
PUBCHEM_CACHE_FILE := $(CACHE_DIR)/pubchem_lookup_cache.tsv

# Directory target; rules that need it list it as an order-only prerequisite.
$(CACHE_DIR):
	@mkdir -p $@
# Stage 10.5c.1: PubChem/OLS lookup on upstream ingredients
# Output of the lookup; input to the ChEBI formula matching stage.
UPSTREAM_PUBCHEM_MAPPED := $(MERGE_MAPPINGS_DIR)/upstream_ingredients_pubchem_mapped.tsv

.PHONY: map-upstream-ingredients
map-upstream-ingredients: $(UPSTREAM_PUBCHEM_MAPPED)
	@echo "$(GREEN)✓ Upstream ingredient PubChem/OLS mapping completed$(NC)"

# src.mapping.map_unmapped_ingredients reads the extracted upstream ingredients
# ($<) and is pointed at both cache files. The cache directory is order-only:
# it must exist, but its timestamp never triggers a rebuild.
$(UPSTREAM_PUBCHEM_MAPPED): $(UPSTREAM_INGREDIENTS) | $(CACHE_DIR)
	@echo "$(BLUE)Mapping upstream ingredients with PubChem + OLS...$(NC)"
	$(PYTHON) -m src.mapping.map_unmapped_ingredients \
		--input $< \
		--output $@ \
		--ols-cache $(OLS_CACHE_FILE) \
		--pubchem-cache $(PUBCHEM_CACHE_FILE)
# Stage 10.5c.2: ChEBI formula matching on remaining unmapped
.PHONY: enhance-upstream-ingredients
enhance-upstream-ingredients: $(UPSTREAM_INGREDIENTS_ENHANCED)
	@echo "$(GREEN)✓ Upstream ingredient ChEBI enhancement completed$(NC)"

# Reshapes the PubChem/OLS output into the three-column layout (id, original,
# mapped = input columns 1, 2, 4; rows with an empty column 2 are dropped),
# runs apply_formula_matching.py on it, then reports per-ontology counts taken
# from column 3 of the result.
# NOTE(review): the reshaped file upstream_ingredients_for_chebi.tsv is a side
# product of this recipe and is not tracked as a target.
# Counts use `grep -c ... || true`: `grep -c` prints 0 itself on no match (and
# exits 1), so the former `|| echo 0` produced "0\n0", which then broke the
# $$((...)) sum below.
$(UPSTREAM_INGREDIENTS_ENHANCED): $(UPSTREAM_PUBCHEM_MAPPED) $(CHEBI_NODES_FILE)
	@echo "$(BLUE)Enhancing upstream ingredients with ChEBI formula/name matching...$(NC)"
	@echo "$(YELLOW)Using ChEBI: $(CHEBI_NODES_FILE)$(NC)"
	@# Convert map_unmapped_ingredients output to apply_formula_matching input format
	@# Input: original_id,original_name,normalized_name,mapped_id,mapped_label,formula,mapping_source,ingredient_type
	@# Output: id,original,mapped
	@awk -F'\t' 'NR==1 {print "id\toriginal\tmapped"} NR>1 && $$2!="" {print $$1"\t"$$2"\t"$$4}' $(UPSTREAM_PUBCHEM_MAPPED) > $(MERGE_MAPPINGS_DIR)/upstream_ingredients_for_chebi.tsv
	$(PYTHON) src/mapping/apply_formula_matching.py \
		--chebi-file $(CHEBI_NODES_FILE) \
		--input $(MERGE_MAPPINGS_DIR)/upstream_ingredients_for_chebi.tsv \
		--output $(UPSTREAM_INGREDIENTS_ENHANCED)
	@TOTAL=$$(tail -n +2 $(UPSTREAM_INGREDIENTS_ENHANCED) | wc -l | tr -d ' '); \
	PUBCHEM=$$(tail -n +2 $(UPSTREAM_INGREDIENTS_ENHANCED) | cut -f3 | grep -ic "PUBCHEM" || true); \
	CHEBI=$$(tail -n +2 $(UPSTREAM_INGREDIENTS_ENHANCED) | cut -f3 | grep -c "CHEBI" || true); \
	FOODON=$$(tail -n +2 $(UPSTREAM_INGREDIENTS_ENHANCED) | cut -f3 | grep -c "FOODON" || true); \
	UBERON=$$(tail -n +2 $(UPSTREAM_INGREDIENTS_ENHANCED) | cut -f3 | grep -c "UBERON" || true); \
	ENVO=$$(tail -n +2 $(UPSTREAM_INGREDIENTS_ENHANCED) | cut -f3 | grep -c "ENVO" || true); \
	MAPPED=$$((PUBCHEM + CHEBI + FOODON + UBERON + ENVO)); \
	echo "$(GREEN)Mapped: $$MAPPED/$$TOTAL (PubChem=$$PUBCHEM ChEBI=$$CHEBI FOODON=$$FOODON UBERON=$$UBERON ENVO=$$ENVO)$(NC)"
# Stage 10.5c.3: Enhanced hydrate mapping
# Strips hydrate suffixes (x N H2O, pentahydrate, etc.) and maps base compounds
.PHONY: enhance-hydrates
enhance-hydrates: $(UPSTREAM_INGREDIENTS_HYDRATE_ENHANCED)
	@echo "$(GREEN)✓ Enhanced hydrate mapping completed$(NC)"

# Runs src.mapping.enhanced_hydrate_mapper on the formula-enhanced upstream
# ingredients, then reports how many column-3 values mention CHEBI / PUBCHEM
# and how many are empty.
# Counts use `grep -c ... || true`: `grep -c` already prints 0 on no match
# (exit status 1), so the former `|| echo 0` printed a second 0.
$(UPSTREAM_INGREDIENTS_HYDRATE_ENHANCED): $(UPSTREAM_INGREDIENTS_ENHANCED) $(CHEBI_NODES_FILE)
	@echo "$(BLUE)Enhancing hydrate compound mappings...$(NC)"
	@echo "$(YELLOW)Stripping hydrate suffixes and looking up base compounds$(NC)"
	$(PYTHON) -m src.mapping.enhanced_hydrate_mapper \
		--input $(UPSTREAM_INGREDIENTS_ENHANCED) \
		--output $(UPSTREAM_INGREDIENTS_HYDRATE_ENHANCED) \
		--chebi-file $(CHEBI_NODES_FILE) \
		--pubchem-cache $(PUBCHEM_CACHE_FILE)
	@TOTAL=$$(tail -n +2 $(UPSTREAM_INGREDIENTS_HYDRATE_ENHANCED) | wc -l | tr -d ' '); \
	CHEBI=$$(tail -n +2 $(UPSTREAM_INGREDIENTS_HYDRATE_ENHANCED) | cut -f3 | grep -c "CHEBI" || true); \
	PUBCHEM=$$(tail -n +2 $(UPSTREAM_INGREDIENTS_HYDRATE_ENHANCED) | cut -f3 | grep -ic "PUBCHEM" || true); \
	UNMAPPED=$$(tail -n +2 $(UPSTREAM_INGREDIENTS_HYDRATE_ENHANCED) | cut -f3 | grep -c "^$$" || true); \
	echo "$(GREEN)After hydrate enhancement: ChEBI=$$CHEBI PubChem=$$PUBCHEM Unmapped=$$UNMAPPED$(NC)"
# Stage 10.5c.4: Apply upstream mappings to strict file
# Uses hydrate-enhanced upstream mappings to improve strict file coverage
COMPOUND_MAPPINGS_STRICT_UPSTREAM := $(MERGE_MAPPINGS_DIR)/compound_mappings_strict_upstream_enhanced.tsv

.PHONY: apply-upstream-to-strict
apply-upstream-to-strict: $(COMPOUND_MAPPINGS_STRICT_UPSTREAM)
	@echo "$(GREEN)✓ Upstream mappings applied to strict file$(NC)"

# Runs src.mapping.apply_upstream_mappings with the hydrate-enhanced upstream
# file and the strict file, then prints ChEBI + PubChem coverage based on
# column 3 of the result.
# Counts use `grep -c ... || true`: with the former `|| echo "0"` a zero-match
# count became "0\n0" and the $$((CHEBI + PUBCHEM)) sum failed.
$(COMPOUND_MAPPINGS_STRICT_UPSTREAM): $(UPSTREAM_INGREDIENTS_HYDRATE_ENHANCED) $(COMPOUND_MAPPINGS_STRICT)
	@echo "$(BLUE)Applying upstream hydrate-enhanced mappings to strict file...$(NC)"
	$(PYTHON) -m src.mapping.apply_upstream_mappings \
		--upstream $(UPSTREAM_INGREDIENTS_HYDRATE_ENHANCED) \
		--strict $(COMPOUND_MAPPINGS_STRICT) \
		--output $(COMPOUND_MAPPINGS_STRICT_UPSTREAM)
	@CHEBI=$$(cut -f3 $(COMPOUND_MAPPINGS_STRICT_UPSTREAM) | grep -c "^CHEBI:" || true); \
	PUBCHEM=$$(cut -f3 $(COMPOUND_MAPPINGS_STRICT_UPSTREAM) | grep -ic "PubChem" || true); \
	TOTAL=$$(tail -n +2 $(COMPOUND_MAPPINGS_STRICT_UPSTREAM) | wc -l | tr -d ' '); \
	MAPPED=$$((CHEBI + PUBCHEM)); \
	echo "$(GREEN)Final strict coverage: $$MAPPED/$$TOTAL (ChEBI=$$CHEBI PubChem=$$PUBCHEM)$(NC)"
# Stage 10.5c.5: Finalize strict mappings
# The canonical final strict file is a plain copy of the upstream-enhanced
# strict file; the rest of the recipe only prints a breakdown of column 3.
COMPOUND_MAPPINGS_STRICT_FINAL := $(MERGE_MAPPINGS_DIR)/compound_mappings_strict_final.tsv

.PHONY: finalize-strict
finalize-strict: $(COMPOUND_MAPPINGS_STRICT_FINAL)
	@echo "$(GREEN)✓ Final strict mappings created$(NC)"

# `tally` is a shell helper local to the summary command: it passes its
# arguments to grep over column 3 of the final file and counts matching lines.
# "Semantic IDs" is the sum of the ChEBI, PubChem, FOODON, UBERON and ENVO
# counts; the percentage is only computed when the file has data rows.
$(COMPOUND_MAPPINGS_STRICT_FINAL): $(COMPOUND_MAPPINGS_STRICT_UPSTREAM)
	@echo "$(BLUE)Creating final strict mapping file...$(NC)"
	@cp $< $@
	@echo ""
	@echo "$(YELLOW)════════════════════════════════════════════════════════════$(NC)"
	@echo "$(YELLOW) FINAL STRICT MAPPING SUMMARY$(NC)"
	@echo "$(YELLOW)════════════════════════════════════════════════════════════$(NC)"
	@tally() { cut -f3 $@ | grep "$$@" | wc -l; }; \
	TOTAL=$$(tail -n +2 $@ | wc -l | tr -d ' '); \
	CHEBI=$$(tally "^CHEBI:"); \
	PUBCHEM=$$(tally -i "PubChem"); \
	FOODON=$$(tally "^FOODON:"); \
	UBERON=$$(tally "^UBERON:"); \
	ENVO=$$(tally "^ENVO:"); \
	CAS=$$(tally "^CAS-RN:"); \
	INGRED=$$(tally "^ingredient:"); \
	UNMAPPED=$$(tally "^$$"); \
	SEMANTIC=$$((CHEBI + PUBCHEM + FOODON + UBERON + ENVO)); \
	echo " Total entries: $$TOTAL"; \
	echo ""; \
	if [ "$$TOTAL" -gt 0 ]; then \
		PERCENT=$$(awk "BEGIN {printf \"%.1f\", $$SEMANTIC * 100 / $$TOTAL}"); \
		echo " Semantic IDs: $$SEMANTIC ($$PERCENT%)"; \
	else \
		echo " Semantic IDs: $$SEMANTIC (0.0%)"; \
	fi; \
	echo " ChEBI: $$CHEBI"; \
	echo " PubChem: $$PUBCHEM"; \
	echo " FOODON: $$FOODON"; \
	echo " UBERON: $$UBERON"; \
	echo " ENVO: $$ENVO"; \
	echo ""; \
	echo " Other:"; \
	echo " CAS-RN: $$CAS"; \
	echo " ingredient: $$INGRED"; \
	echo " Unmapped: $$UNMAPPED"; \
	echo "$(YELLOW)════════════════════════════════════════════════════════════$(NC)"
	@echo ""
	@echo "$(GREEN)Output: $@$(NC)"
# Stage 10.5c.5.5: Create hydrate-specific mappings
# Variant of the final strict file that uses specific hydrated ChEBI IDs
# (e.g., CHEBI:86158 for CaCl2·2H2O); the base file maps every hydrate to the
# anhydrous ChEBI ID (degenerate mapping).
COMPOUND_MAPPINGS_STRICT_HYDRATE := $(MERGE_MAPPINGS_DIR)/compound_mappings_strict_final_hydrate.tsv

.PHONY: create-hydrate-mappings
create-hydrate-mappings: $(COMPOUND_MAPPINGS_STRICT_HYDRATE)
	@echo "$(GREEN)✓ Hydrate-specific mappings created$(NC)"

# src.mapping.create_hydrate_mappings: final strict file ($<) plus the ChEBI
# formulas file in, hydrate-specific file ($@) out.
$(COMPOUND_MAPPINGS_STRICT_HYDRATE): $(COMPOUND_MAPPINGS_STRICT_FINAL) $(CHEBI_FORMULAS_FILE)
	@echo "$(BLUE)Creating hydrate-specific mapping file...$(NC)"
	$(PYTHON) -m src.mapping.create_hydrate_mappings \
		--input $< \
		--chebi-formulas $(CHEBI_FORMULAS_FILE) \
		--output $@
	@echo "$(GREEN)Output: $@$(NC)"
# Stage 10.5c.5.6: Create simplified mapping files
# Lightweight outputs holding just chemical name, formula, and identifiers.
# Each flavour exists in a base and a hydrate-specific version.
COMPOUND_MAPPINGS_SIMPLIFIED             := $(MERGE_MAPPINGS_DIR)/compound_mappings_simplified.tsv
COMPOUND_MAPPINGS_SIMPLIFIED_HYDRATE     := $(MERGE_MAPPINGS_DIR)/compound_mappings_simplified_hydrate.tsv
COMPOUND_MAPPINGS_CHEMICALS_ONLY         := $(MERGE_MAPPINGS_DIR)/compound_mappings_chemicals_only.tsv
COMPOUND_MAPPINGS_CHEMICALS_ONLY_HYDRATE := $(MERGE_MAPPINGS_DIR)/compound_mappings_chemicals_only_hydrate.tsv

# Phony entry point requiring all four simplified outputs.
.PHONY: create-simplified-mappings
create-simplified-mappings: \
    $(COMPOUND_MAPPINGS_SIMPLIFIED) \
    $(COMPOUND_MAPPINGS_SIMPLIFIED_HYDRATE) \
    $(COMPOUND_MAPPINGS_CHEMICALS_ONLY) \
    $(COMPOUND_MAPPINGS_CHEMICALS_ONLY_HYDRATE)
	@echo "$(GREEN)✓ Simplified mapping files created$(NC)"
# create_simplified_mappings.py writes both simplified files in one run.
# A rule header `a b: prereq` defines two independent rules, so under `make -j`
# the script could run twice concurrently. The base file is therefore the real
# target and the hydrate file is chained behind it.
$(COMPOUND_MAPPINGS_SIMPLIFIED): $(COMPOUND_MAPPINGS_STRICT_FINAL) $(COMPOUND_MAPPINGS_STRICT_HYDRATE)
	@echo "$(BLUE)Creating simplified mapping files (all unique chemicals)...$(NC)"
	@echo "$(YELLOW)Extracting: chemical name, formula, and identifiers$(NC)"
	$(PYTHON) src/scripts/create_simplified_mappings.py \
		--strict-input $(COMPOUND_MAPPINGS_STRICT_FINAL) \
		--strict-output $(COMPOUND_MAPPINGS_SIMPLIFIED) \
		--hydrate-input $(COMPOUND_MAPPINGS_STRICT_HYDRATE) \
		--hydrate-output $(COMPOUND_MAPPINGS_SIMPLIFIED_HYDRATE)
	@echo "$(GREEN)Outputs:$(NC)"
	@echo " $(COMPOUND_MAPPINGS_SIMPLIFIED)"
	@echo " $(COMPOUND_MAPPINGS_SIMPLIFIED_HYDRATE)"

# By-product of the rule above: verify it exists and keep its timestamp at
# least as new as its producer.
$(COMPOUND_MAPPINGS_SIMPLIFIED_HYDRATE): $(COMPOUND_MAPPINGS_SIMPLIFIED)
	@test -f $@ || { echo "$(RED)Missing $@ - delete $< and re-run to regenerate both files$(NC)"; exit 1; }
	@touch $@
# create_chemicals_only_mappings.py writes both chemicals-only files in one
# run. A rule header `a b: prereq` defines two independent rules, so under
# `make -j` the script could run twice concurrently. The base file is therefore
# the real target and the hydrate file is chained behind it.
$(COMPOUND_MAPPINGS_CHEMICALS_ONLY): $(COMPOUND_MAPPINGS_SIMPLIFIED) $(COMPOUND_MAPPINGS_SIMPLIFIED_HYDRATE)
	@echo "$(BLUE)Creating chemicals-only mapping files (no complex ingredients/media)...$(NC)"
	@echo "$(YELLOW)Excluding: FOODON, medium, ingredient codes$(NC)"
	$(PYTHON) src/scripts/create_chemicals_only_mappings.py \
		--strict-input $(COMPOUND_MAPPINGS_SIMPLIFIED) \
		--strict-output $(COMPOUND_MAPPINGS_CHEMICALS_ONLY) \
		--hydrate-input $(COMPOUND_MAPPINGS_SIMPLIFIED_HYDRATE) \
		--hydrate-output $(COMPOUND_MAPPINGS_CHEMICALS_ONLY_HYDRATE)
	@echo "$(GREEN)Outputs:$(NC)"
	@echo " $(COMPOUND_MAPPINGS_CHEMICALS_ONLY)"
	@echo " $(COMPOUND_MAPPINGS_CHEMICALS_ONLY_HYDRATE)"

# By-product of the rule above: verify it exists and keep its timestamp at
# least as new as its producer.
$(COMPOUND_MAPPINGS_CHEMICALS_ONLY_HYDRATE): $(COMPOUND_MAPPINGS_CHEMICALS_ONLY)
	@test -f $@ || { echo "$(RED)Missing $@ - delete $< and re-run to regenerate both files$(NC)"; exit 1; }
	@touch $@
# Stage 10.5c.5.7: Map biological ingredients to FOODON/ENVO
# Uses OAK to map complex biological ingredients (extracts, peptones, broths) to FOODON ontology
# Preserves existing FOODON/ENVO IDs and adds new mappings deterministically
FOODON_MAPPING_DIR := pipeline_output/foodon_mapping
BIOLOGICAL_INGREDIENTS_FOODON := $(FOODON_MAPPING_DIR)/biological_ingredients_foodon_final.tsv

.PHONY: map-biological-ingredients-foodon
map-biological-ingredients-foodon: $(BIOLOGICAL_INGREDIENTS_FOODON)
	@echo "$(GREEN)✓ Biological ingredients mapped to FOODON/ENVO$(NC)"

# Runs oak_foodon_mapper.py on the final strict file and, if the output exists,
# prints a summary: data rows, rows with a non-empty column 2, and lines
# containing "preserved".
# Fixes relative to the earlier recipe:
#   - counts use `grep -c ... || true`; `grep -c` prints 0 itself on no match
#     (exit 1), so `|| echo 0` yielded "0\n0" and broke the arithmetic;
#   - the percentage is guarded so an output with no data rows does not cause
#     a division by zero.
$(BIOLOGICAL_INGREDIENTS_FOODON): $(COMPOUND_MAPPINGS_STRICT_FINAL) | $(FOODON_MAPPING_DIR)
	@echo "$(BLUE)Mapping biological ingredients to FOODON using OAK...$(NC)"
	@echo "$(YELLOW)Enhanced search: exact, lowercase, normalized, synonyms, base compound$(NC)"
	$(PYTHON) src/mapping/oak_foodon_mapper.py \
		--input $(COMPOUND_MAPPINGS_STRICT_FINAL) \
		--output $(BIOLOGICAL_INGREDIENTS_FOODON)
	@if [ -f $(BIOLOGICAL_INGREDIENTS_FOODON) ]; then \
		TOTAL=$$(tail -n +2 $(BIOLOGICAL_INGREDIENTS_FOODON) | wc -l | tr -d ' '); \
		MAPPED=$$(tail -n +2 $(BIOLOGICAL_INGREDIENTS_FOODON) | cut -f2 | grep -c -v "^$$" || true); \
		PRESERVED=$$(grep -c "preserved" $(BIOLOGICAL_INGREDIENTS_FOODON) || true); \
		NEW_MAPPED=$$((MAPPED - PRESERVED)); \
		if [ "$$TOTAL" -gt 0 ]; then \
			PERCENT=$$((MAPPED * 100 / TOTAL)); \
		else \
			PERCENT=0; \
		fi; \
		echo ""; \
		echo "$(YELLOW)FOODON Mapping Summary:$(NC)"; \
		echo " Total biological ingredients: $$TOTAL"; \
		echo " With FOODON/ENVO IDs: $$MAPPED ($$PERCENT%)"; \
		echo " Preserved from current: $$PRESERVED"; \
		echo " Newly mapped via OAK: $$NEW_MAPPED"; \
		echo " Unable to map: $$((TOTAL - MAPPED))"; \
		echo "$(GREEN)Output: $(BIOLOGICAL_INGREDIENTS_FOODON)$(NC)"; \
	fi

# Output directory, used above as an order-only prerequisite.
$(FOODON_MAPPING_DIR):
	@mkdir -p $(FOODON_MAPPING_DIR)
# Stage 10.5c.6: Validate ontology mappings
# Uses OAK or local ChEBI nodes to verify IDs exist in ontologies
ONTOLOGY_VALIDATION_REPORT := $(MERGE_MAPPINGS_DIR)/ontology_validation_report.tsv

.PHONY: validate-ontology-mappings
validate-ontology-mappings: $(ONTOLOGY_VALIDATION_REPORT)
	@echo "$(GREEN)✓ Ontology mapping validation completed$(NC)"

# Runs src.quality.validate_ontology_mappings against the local ChEBI nodes
# file. The validator's exit status is deliberately ignored (`|| true`) so the
# summary below is still printed.
# Fixes relative to the earlier recipe:
#   - VALID uses `grep -cw`: a plain "valid" pattern also matches every
#     "invalid" line, so invalid rows were counted as valid;
#   - counts use `|| true` instead of `|| echo 0`; `grep -c` prints 0 itself on
#     no match (exit 1), and the doubled "0\n0" broke the `-gt` test.
# NOTE(review): assumes the report marks rows with the whole words
# "valid"/"invalid" - confirm against the validator's output format.
$(ONTOLOGY_VALIDATION_REPORT): $(COMPOUND_MAPPINGS_STRICT_FINAL) $(CHEBI_NODES_FILE)
	@echo "$(BLUE)Validating ontology mappings...$(NC)"
	@echo "$(YELLOW)Checking ChEBI, UBERON, FOODON, ENVO IDs against ontologies$(NC)"
	$(PYTHON) -m src.quality.validate_ontology_mappings \
		--input $(COMPOUND_MAPPINGS_STRICT_FINAL) \
		--output $(ONTOLOGY_VALIDATION_REPORT) \
		--chebi-nodes $(CHEBI_NODES_FILE) || true
	@if [ -f $(ONTOLOGY_VALIDATION_REPORT) ]; then \
		TOTAL=$$(tail -n +2 $(ONTOLOGY_VALIDATION_REPORT) | wc -l | tr -d ' '); \
		VALID=$$(grep -cw "valid" $(ONTOLOGY_VALIDATION_REPORT) || true); \
		INVALID=$$(grep -c "invalid" $(ONTOLOGY_VALIDATION_REPORT) || true); \
		echo ""; \
		echo "$(YELLOW)Validation Summary:$(NC)"; \
		echo " Total IDs: $$TOTAL"; \
		echo " Valid: $$VALID"; \
		echo " Invalid: $$INVALID"; \
		if [ "$$INVALID" -gt 0 ]; then \
			echo "$(RED)⚠ Found $$INVALID invalid ontology IDs$(NC)"; \
		else \
			echo "$(GREEN)✓ All ontology IDs are valid$(NC)"; \
		fi; \
	fi
# OAK-backed validation (slower but authoritative).
# Phony: always re-runs the validator with --use-oak in batches of 50 and
# writes to the same report path as validate-ontology-mappings.
.PHONY: validate-ontology-mappings-oak
validate-ontology-mappings-oak: $(COMPOUND_MAPPINGS_STRICT_FINAL)
	@echo "$(BLUE)Validating ontology mappings using OAK (this may take a while)...$(NC)"
	$(PYTHON) -m src.quality.validate_ontology_mappings \
		--input $< \
		--output $(ONTOLOGY_VALIDATION_REPORT) \
		--use-oak \
		--batch-size 50
# ============================================================================
# Stage 10.6: Map Unmapped Ingredients (OLS + PubChem)
# Uses multi-ontology search (UBERON, FOODON, ENVO) for biological materials
# Uses PubChem fallback for chemicals without ChEBI mappings
# ============================================================================
# kg-microbe mappings directory holding the unmapped ingredients file.
# The default is a machine-specific absolute path; `?=` lets other checkouts
# supply their own location through the environment as well as on the command
# line (e.g. `make KG_MICROBE_MAPPINGS=/path/to/mappings map-unmapped-ingredients`).
KG_MICROBE_MAPPINGS ?= /Users/marcin/Documents/VIMSS/ontology/KG-Hub/KG-Microbe/kg-microbe/mappings
UNMAPPED_MEDIADIVE_FILE := $(KG_MICROBE_MAPPINGS)/unmapped_mediadive_ingredients.tsv
# Output files (CACHE_DIR, OLS_CACHE_FILE, PUBCHEM_CACHE_FILE defined in Stage 10.5c)
ADDITIONAL_MAPPINGS := $(MERGE_MAPPINGS_DIR)/additional_ingredient_mappings.tsv
EXTENDED_LOOKUP_TABLE := $(MERGE_MAPPINGS_DIR)/compound_name_lookup_extended.tsv

# Map unmapped ingredients from kg-microbe MediaDive analysis
.PHONY: map-unmapped-ingredients
map-unmapped-ingredients: $(ADDITIONAL_MAPPINGS)
	@echo "$(GREEN)✓ Unmapped ingredients mapping completed$(NC)"

# Runs src.mapping.map_unmapped_ingredients on the unmapped MediaDive file with
# both cache files. The cache directory is order-only: it must exist, but its
# timestamp never triggers a rebuild.
$(ADDITIONAL_MAPPINGS): $(UNMAPPED_MEDIADIVE_FILE) | $(CACHE_DIR)
	@echo "$(BLUE)Mapping unmapped MediaDive ingredients...$(NC)"
	@echo "$(YELLOW)Using OLS4 (UBERON, FOODON, ENVO) for biological materials$(NC)"
	@echo "$(YELLOW)Using PubChem for chemical compounds$(NC)"
	$(PYTHON) -m src.mapping.map_unmapped_ingredients \
		--input $(UNMAPPED_MEDIADIVE_FILE) \
		--output $(ADDITIONAL_MAPPINGS) \
		--ols-cache $(OLS_CACHE_FILE) \
		--pubchem-cache $(PUBCHEM_CACHE_FILE)
# Fold the additional ingredient mappings into the compound lookup table.
.PHONY: merge-additional-mappings
merge-additional-mappings: $(EXTENDED_LOOKUP_TABLE)
	@echo "$(GREEN)✓ Additional mappings merged into lookup table$(NC)"

# src.mapping.merge_additional_mappings combines the base lookup table with the
# additional mappings ($<) into the extended lookup table ($@).
$(EXTENDED_LOOKUP_TABLE): $(ADDITIONAL_MAPPINGS) $(COMPOUND_LOOKUP_TABLE)
	@echo "$(BLUE)Merging additional mappings into compound lookup table...$(NC)"
	$(PYTHON) -m src.mapping.merge_additional_mappings \
		--lookup-table $(COMPOUND_LOOKUP_TABLE) \
		--additional $< \
		--output $@
# Full unmapped-ingredient pipeline: builds the extended lookup table through
# merge-additional-mappings, then prints the line counts of the extended and
# the original lookup tables.
.PHONY: extend-mappings
extend-mappings: merge-additional-mappings
	@echo "$(GREEN)✓ Extended mappings pipeline completed$(NC)"
	@ORIGINAL_COUNT=$$(wc -l < $(COMPOUND_LOOKUP_TABLE) 2>/dev/null | tr -d ' '); \
	EXTENDED_COUNT=$$(wc -l < $(EXTENDED_LOOKUP_TABLE) 2>/dev/null | tr -d ' '); \
	echo "$(GREEN)Extended lookup table: $$EXTENDED_COUNT entries (was $$ORIGINAL_COUNT)$(NC)"
# ============================================================================
# Stage 11: Property Calculation - Using enhanced mappings with hydration-corrected MW
# ============================================================================
# Phony entry point; the work is tracked through the .done stamp file in
# $(MEDIA_PROPERTIES_DIR).
# NOTE(review): the 97.6% figure in the banner is a fixed string, not computed here.
compute-properties: $(MEDIA_PROPERTIES_DIR)/.done
	@echo "$(GREEN)✓ Media properties calculation completed using expanded ingredients (97.6% ChEBI coverage)$(NC)"
.PHONY: compute-properties
# Calculate pH, salinity, ionic strength using expanded complex ingredients
$(MEDIA_PROPERTIES_DIR)/.done: $(MEDIA_COMPOSITION_EXPANDED) $(CHEMICAL_PROPERTIES)
@echo "$(BLUE)Property Calculation: Using expanded complex ingredients (97.6% ChEBI coverage)...$(NC)"
@echo "$(YELLOW)ADVANTAGE: Complex ingredients resolved to constituents (yeast extract → 34 chemicals)$(NC)"