Skip to content

Commit 2cc7d3a

Browse files
committed
Always recreate forum index, add logging, correctly get post ids
Post IDs were being returned as strings from Solr, so the reindexing code was deleting those posts from the index because the string IDs didn't match the integer IDs from the database. Cast them to int. However, we also remove the stale-post cleanup code because we now always create a new index on demand - we should never be in a position where we try to reindex onto an existing index.
1 parent 3989885 commit 2cc7d3a

File tree

2 files changed

+12
-26
lines changed

2 files changed

+12
-26
lines changed

search/management/commands/reindex_search_engine_forum.py

Lines changed: 11 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@
2222
import json
2323
import logging
2424
import os
25+
import time
2526

2627
from django.core.management.base import BaseCommand
2728

2829
from forum.models import Post
2930
from search import solrapi
31+
from search.management.commands.post_dirty_sounds_to_search_engine import time_stats
3032
from utils.search import get_search_engine
3133
from utils.search.search_forum import add_posts_to_search_engine, get_all_post_ids_from_search_engine, \
3234
delete_all_posts_from_search_engine, delete_posts_from_search_engine
@@ -46,13 +48,6 @@ def add_arguments(self, parser):
4648
type=int,
4749
help='How many posts to add at once')
4850

49-
parser.add_argument(
50-
'--recreate-index',
51-
action='store_true',
52-
dest='recreate_index',
53-
default=False,
54-
help='Create a new index and index into it. Update the forum alias to point to this new index.')
55-
5651
def handle(self, *args, **options):
5752
search_engine = get_search_engine()
5853

@@ -65,30 +60,23 @@ def handle(self, *args, **options):
6560
collection_name = f"forum_{current_date}"
6661
new_collection_url = f"{search_engine.solr_base_url}/solr/{collection_name}"
6762
solr_api = solrapi.SolrManagementAPI(search_engine.solr_base_url, collection_name)
68-
recreate_index = options['recreate_index']
69-
70-
if recreate_index:
71-
solr_api.create_collection_and_schema(delete_default_fields_definition, forum_schema_definition, "thread_id")
63+
solr_api.create_collection_and_schema(delete_default_fields_definition, forum_schema_definition, "thread_id")
7264

7365
# Select all moderated forum posts and index them
7466
all_posts = Post.objects.select_related("thread", "author", "thread__author", "thread__forum")\
7567
.filter(moderation_state="OK")
7668
num_posts = len(all_posts)
7769
console_logger.info("Re-indexing %d forum posts", num_posts)
7870
slice_size = options['size_size']
71+
n_posts_indexed_correctly = 0
72+
starttime = time.monotonic()
7973
for i in range(0, num_posts, slice_size):
8074
post_ids_slice = all_posts[i:i + slice_size]
75+
n_posts_indexed = len(post_ids_slice)
8176
add_posts_to_search_engine(post_ids_slice, solr_collection_url=new_collection_url)
77+
n_posts_indexed_correctly += n_posts_indexed
78+
elapsed, remaining = time_stats(n_posts_indexed_correctly, num_posts, starttime)
79+
console_logger.info(f"Added {n_posts_indexed_correctly}/{num_posts} posts. Elapsed: {elapsed}, Remaining: {remaining}")
8280

83-
# Find all indexed forum posts which are not in the DB and remove them. This part of the code should do nothing
84-
# as deleted forum posts should be removed from the index in due time. In particular, if the "clear index" is
85-
# passed, this bit of code should remove no posts.
86-
indexed_post_ids = get_all_post_ids_from_search_engine(solr_collection_url=new_collection_url)
87-
post_ids_to_delete = list(set(indexed_post_ids).difference(all_posts.values_list('id', flat=True)))
88-
console_logger.info("Deleting %d non-existing posts from the search engine", len(post_ids_to_delete))
89-
if post_ids_to_delete:
90-
delete_posts_from_search_engine(post_ids_to_delete, solr_collection_url=new_collection_url)
91-
92-
if recreate_index:
93-
console_logger.info("Updating the forum alias to point to the new index")
94-
solr_api.create_collection_alias("forum")
81+
console_logger.info("Updating the forum alias to point to the new index")
82+
solr_api.create_collection_alias("forum")

utils/search/search_forum.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,10 @@ def add_posts_to_search_engine(post_objects, solr_collection_url=None):
3636
"""
3737
num_posts = len(post_objects)
3838
try:
39-
console_logger.info("Adding %d posts to search engine" % num_posts)
4039
search_logger.info("Adding %d posts to search engine" % num_posts)
4140
get_search_engine(forum_index_url=solr_collection_url).add_forum_posts_to_index(post_objects)
4241
return num_posts
4342
except SearchEngineException as e:
44-
console_logger.info(f"Failed to add posts to search engine index: {str(e)}")
4543
search_logger.info(f"Failed to add posts to search engine index: {str(e)}")
4644
return 0
4745

@@ -87,7 +85,7 @@ def get_all_post_ids_from_search_engine(solr_collection_url=None, page_size=2000
8785
while solr_count is None or len(solr_ids) < solr_count:
8886
response = search_engine.search_forum_posts(query_filter='*:*', group_by_thread=False,
8987
offset=(current_page - 1) * page_size, num_posts=page_size)
90-
solr_ids += [element['id'] for element in response.docs]
88+
solr_ids += [int(element['id']) for element in response.docs]
9189
solr_count = response.num_found
9290
current_page += 1
9391
except SearchEngineException as e:

0 commit comments

Comments
 (0)