WIP: more upgrades. #21

Open · wants to merge 5 commits into `main`
6 changes: 4 additions & 2 deletions chill_filter_web/__init__.py
@@ -19,6 +19,8 @@
 from .database_info import MOLTYPE, KSIZE, SCALED
 from .utils import *
 
+UNIQUE_INTERSECT_BP_THRESHOLD = 3*SCALED
+
 default_settings = dict(
     UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), '../chill-data'),
     EXAMPLES_DIR = os.path.join(os.path.dirname(__file__), "examples/"),
@@ -299,7 +301,7 @@ def sig_search(md5, filename):
     # read!
     try:
         gather_df = pd.read_csv(csv_filename)
-        gather_df = gather_df[gather_df["f_unique_weighted"] >= 0.001]
+        gather_df = gather_df[gather_df["unique_intersect_bp"] >= UNIQUE_INTERSECT_BP_THRESHOLD]
         gather_df = gather_df.sort_values(by='gather_result_rank')
     except:
         gather_df = []
@@ -375,7 +377,7 @@ def sig_subsearch(md5, filename, dbname):
     # read!
     try:
         gather_df = pd.read_csv(csv_filename)
-        gather_df = gather_df[gather_df["f_unique_weighted"] >= 0.001]
+        gather_df = gather_df[gather_df["unique_intersect_bp"] >= UNIQUE_INTERSECT_BP_THRESHOLD]
         gather_df = gather_df.sort_values(by='f_unique_weighted', ascending=False)
     except:
         gather_df = []
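
For context on the new cutoff (our reading of the diff, not text from the PR): with sourmash FracMinHash sketches, each retained hash represents roughly `SCALED` base pairs, so `3*SCALED` asks for a match to be supported by about three hashes before it is reported. Unlike the old `f_unique_weighted >= 0.001` cutoff, which was a fraction of the weighted sample, this threshold is expressed in base pairs and so tracks sketch resolution. A minimal sketch of the same filter, assuming `SCALED = 100_000` (the `scaled=100000` value used in the FAQ below) and a hypothetical CSV filename:

```python
# illustrative only -- not the app code in chill_filter_web/__init__.py
import pandas as pd

SCALED = 100_000                             # assumes scaled=100000, as in the FAQ
UNIQUE_INTERSECT_BP_THRESHOLD = 3 * SCALED   # 300,000 bp ~= 3 retained hashes

gather_df = pd.read_csv("gather-results.csv")    # hypothetical filename
keep = gather_df["unique_intersect_bp"] >= UNIQUE_INTERSECT_BP_THRESHOLD
gather_df = gather_df[keep].sort_values(by="gather_result_rank")
```
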
7 changes: 6 additions & 1 deletion chill_filter_web/database_info.py
@@ -57,7 +57,7 @@ def get_nextlevel_db(self, match_name):
         'genbank-plants': 'all plants (GenBank July 2024)',
     },
     {
-        'bacteria and archaea (GTDB rs220)': 'gtdb-rs220-phylum',
+        'bacteria and archaea (GTDB rs220)': 'gtdb-rs220-genus',
         'genbank-plants': 'genbank-plants-2024.07',
     }, default=True),
 DatabaseDescription('gtdb-only',
@@ -75,6 +75,11 @@
     '',
     'all bacterial and archaeal phyla (GTDB rs220)',
     {}, {}),
+DatabaseDescription('gtdb-rs220-genus',
+    'prepare-db/merged-genus.rocksdb',
+    '',
+    'all bacterial and archaeal genera (GTDB rs220)',
+    {}, {}),
 DatabaseDescription('genbank-plants-2024.07',
     'prepare-db/genbank-plants-2024.07.rocksdb',
     '',
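
Reading the two dicts passed to each `DatabaseDescription` (our interpretation; the PR doesn't spell this out): the first appears to map database keys to display names, and the second to map a top-level match to the key of the next, finer-grained database — so this change routes the GTDB drill-down from the phylum-level database straight to the genus-level one. A guessed sketch of what that lookup might amount to:

```python
# hypothetical sketch; the real logic is DatabaseDescription.get_nextlevel_db
# in chill_filter_web/database_info.py.
NEXT_LEVEL = {
    'bacteria and archaea (GTDB rs220)': 'gtdb-rs220-genus',  # was gtdb-rs220-phylum
    'genbank-plants': 'genbank-plants-2024.07',
}

def get_nextlevel_db(match_name):
    """Return the key of the database to search next, or None if there is no
    finer-grained database configured for this match."""
    return NEXT_LEVEL.get(match_name)
```
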
33 changes: 33 additions & 0 deletions chill_filter_web/templates/faq.md
@@ -30,6 +30,39 @@
delete your sketch at any time. <p> For more information on
sketching, as well as an over-abundance of technical details, please
see [the sourmash documentation](https://sourmash.readthedocs.io/).

## Should I subset my data to make things go faster?

We highly recommend analyzing your entire data set - no filtering,
subsetting, or removing reads. chill-filter's speed will not be
affected. But of course you're welcome to try out different
approaches - and we're happy to chat about it in
[the issue tracker!](https://github.com/dib-lab/chill-filter/issues)

## How can I do a combined analysis of several sequencing runs?

For now, you either have to combine all your files into one FASTA/FASTQ
file and sketch that, or use a command-line tool to build a combined
sketch. Here's the sourmash command line you can use:
```
sourmash sketch dna -p k=51,scaled=100000 [list of files] --name SampleName -o combined.sig.zip
```
although there are other, faster programs you can use (e.g. [manysketch](https://github.com/sourmash-bio/sourmash_plugin_branchwater/tree/main/doc)).

You can then upload the resulting `combined.sig.zip` on the front page for a combined analysis!
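
If you'd like to double-check the combined sketch before uploading, here's a quick sanity check using the sourmash Python API (our suggestion; not required by chill-filter):

```python
# print the name, k-mer size, and scaled value of each signature in the zip
import sourmash

for ss in sourmash.load_file_as_signatures("combined.sig.zip"):
    print(ss.name, ss.minhash.ksize, ss.minhash.scaled)
```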

## How can I analyze a dozen different samples?

If you want to use the website, you'll need to upload each sample separately.
Sorry!

There are ways to analyze hundreds to thousands of samples at the
command line. This requires installing the sourmash software and
preparing some databases; ask us for details in
[the issue tracker!](https://github.com/dib-lab/chill-filter/issues)
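
As a starting point, here is a minimal Python loop that sketches a directory of samples with the same parameters as above (hypothetical file layout; the full command-line analysis additionally requires the prepared databases):

```python
# sketch every FASTQ in samples/ -- one .sig.zip per sample
import glob
import os
import subprocess

for path in glob.glob("samples/*.fastq.gz"):      # hypothetical layout
    name = os.path.basename(path).removesuffix(".fastq.gz")
    subprocess.run([
        "sourmash", "sketch", "dna",
        "-p", "k=51,scaled=100000",
        path, "--name", name,
        "-o", f"{name}.sig.zip",
    ], check=True)
```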

We're also working on a REST API, so that you can use chill-filter
from the command line without installing the databases locally.

## How can I see which specific microbial or plant genomes are in my sample?

You can see matches to specific plant genomes, but not to specific
27 changes: 27 additions & 0 deletions scripts/file-to-url.py
@@ -0,0 +1,27 @@
#! /usr/bin/env python
"""Print the chill-filter URL path for a sketch file."""
import sourmash
import argparse
import sys
import os


from chill_filter_web import utils


def main():
    p = argparse.ArgumentParser()
    p.add_argument('sketchfile')
    args = p.parse_args()

    # load the sketch and make sure it's valid
    ss = utils.load_sig(args.sketchfile)
    assert ss is not None

    # chill-filter URLs are /<first 8 chars of md5>/<basename>
    filename = os.path.basename(args.sketchfile)
    md5 = ss.md5sum()[:8]

    url = f"/{md5}/{filename}"
    print(url)


if __name__ == '__main__':
    sys.exit(main())
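
Example invocation (hypothetical filename and output; the 8-character prefix comes from the sketch's md5sum):

```
$ python scripts/file-to-url.py combined.sig.zip
/0a1b2c3d/combined.sig.zip
```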