diff --git a/Gemfile.lock b/Gemfile.lock index 5835e1d9f..ea58a3302 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,8 +1,8 @@ GIT remote: https://github.com/boost/codeclimate_diff.git - revision: 8974e206e994dbd168a46de24faaabfea8503c8e + revision: ee2cece9fb5baffb8d9a367c2ffa41e1266a3c32 specs: - codeclimate_diff (0.1.13) + codeclimate_diff (0.1.14) colorize json optparse @@ -164,7 +164,7 @@ GEM debug_inspector (>= 1.2.0) bson (5.0.0) builder (3.3.0) - byebug (11.1.3) + byebug (12.0.0) case_transform (0.2) activesupport childprocess (5.0.0) @@ -221,7 +221,7 @@ GEM hash-deep-merge (0.1.1) htmlentities (4.3.4) http-accept (1.7.0) - http-cookie (1.0.7) + http-cookie (1.1.0) domain_name (~> 0.5) i18n (1.14.7) concurrent-ruby (~> 1.0) @@ -230,7 +230,7 @@ GEM pp (>= 0.6.0) rdoc (>= 4.0.0) reline (>= 0.4.2) - json (2.7.2) + json (2.16.0) json-canonicalization (1.0.0) json-ld (3.3.1) htmlentities (~> 4.3) @@ -278,10 +278,10 @@ GEM net-smtp marcel (1.0.4) method_source (1.1.0) - mime-types (3.6.0) + mime-types (3.7.0) logger - mime-types-data (~> 3.2015) - mime-types-data (3.2024.1001) + mime-types-data (~> 3.2025, >= 3.2025.0507) + mime-types-data (3.2025.0924) mini_mime (1.1.5) minitest (5.25.5) mongo (2.20.0) @@ -312,7 +312,7 @@ GEM racc (~> 1.4) nokogiri (1.18.9-x86_64-linux-gnu) racc (~> 1.4) - optparse (0.5.0) + optparse (0.8.0) orm_adapter (0.5.0) parallel (1.25.1) parser (3.3.4.0) @@ -323,12 +323,12 @@ GEM pr_geohash (1.0.0) prettyprint (0.2.0) progressbar (1.13.0) - pry (0.14.2) + pry (0.15.2) coderay (~> 1.1) method_source (~> 1.0) - pry-byebug (3.10.1) - byebug (~> 11.0) - pry (>= 0.13, < 0.15) + pry-byebug (3.11.0) + byebug (~> 12.0) + pry (>= 0.13, < 0.16) pry-rails (0.3.11) pry (>= 0.13.0) psych (5.2.6) diff --git a/app/models/supplejack_api/collection_metric.rb b/app/models/supplejack_api/collection_metric.rb index ae4fc8168..03d3beb97 100644 --- a/app/models/supplejack_api/collection_metric.rb +++ b/app/models/supplejack_api/collection_metric.rb @@ -6,6 
+6,7 @@ class CollectionMetric include Mongoid::Document include Mongoid::Timestamps include SupplejackApi::Concerns::QueryableByDate + include SupplejackApi::Concerns::MetricHelpers field :d, as: :date, type: Date, default: Time.now.utc field :dc, as: :display_collection, type: String @@ -31,49 +32,56 @@ class CollectionMetric ) end - def self.spawn(date_range = (30.days.ago.utc..Time.zone.now.yesterday.beginning_of_day)) + def self.spawn(date_range = (Time.zone.at(0).utc..Time.now.yesterday.utc.beginning_of_day)) return unless SupplejackApi.config.log_metrics == true - dates = SupplejackApi::RecordMetric.where(date: date_range).map(&:date).uniq - dates.each do |date| - Rails.logger.info("COLLECTION METRICS: Processing date: #{date}") - collections = SupplejackApi::RecordMetric.where(date:).pluck(:display_collection).uniq + record_metrics_dates_between(date_range).each do |date| + logger.info("COLLECTION METRIC: Processing date: #{date}") + display_collections = SupplejackApi::RecordMetric + .where(date:, processed_by_collection_metrics: false) + .distinct(:display_collection) - collections.each do |collection| - Rails.logger.info("COLLECTION METRICS: Processing collection: #{collection}") - record_metrics = record_metrics_to_be_processed(date, collection) - collection_metrics = find_or_create_by(date:, display_collection: collection).inc( - searches: record_metrics.sum(:appeared_in_searches), - record_page_views: record_metrics.sum(:page_views), - user_set_views: record_metrics.sum(:user_set_views), - user_story_views: record_metrics.sum(:user_story_views), - records_added_to_user_sets: record_metrics.sum(:added_to_user_sets), - records_added_to_user_stories: record_metrics.sum(:added_to_user_stories), - total_source_clickthroughs: record_metrics.sum(:source_clickthroughs) - ) + display_collections.each do |display_collection| + logger.info("COLLECTION METRIC: Processing collection: #{display_collection}") + record_metrics = record_metrics_to_be_processed(date, 
display_collection) - if collection_metrics.save + if update_collection_metrics(record_metrics, date, display_collection) record_metrics.update_all(processed_by_collection_metrics: true) else - Rails.logger.error "Unable to summarize record metrics from collection: #{collection} date: #{date}" + logger.error "Unable to summarize record metrics from collection: #{display_collection} date: #{date}" end end regenerate_all_collection_metrics!(date) end end + def self.update_collection_metrics(record_metrics, date, display_collection) + collection_metrics = find_or_create_by(date:, display_collection:).inc( + searches: record_metrics.sum(:appeared_in_searches), + record_page_views: record_metrics.sum(:page_views), + user_set_views: record_metrics.sum(:user_set_views), + user_story_views: record_metrics.sum(:user_story_views), + records_added_to_user_sets: record_metrics.sum(:added_to_user_sets), + records_added_to_user_stories: record_metrics.sum(:added_to_user_stories), + total_source_clickthroughs: record_metrics.sum(:source_clickthroughs) + ) + + collection_metrics.save + end + def self.record_metrics_to_be_processed(date, display_collection) - Rails.logger.info("COLLECTION METRICS: Gathering records to be processed: #{date} #{display_collection}") + logger.info("COLLECTION METRIC: Gathering records to be processed: #{date} #{display_collection}") SupplejackApi::RecordMetric.where( date:, display_collection:, - :processed_by_collection_metrics.in => [nil, '', false] + processed_by_collection_metrics: false ) end def self.regenerate_all_collection_metrics!(date) - Rails.logger.info("COLLECTION METRICS: Regenerate all collection metrics #{date}") + logger.info("COLLECTION METRIC: Regenerate all collection metrics #{date}") delete_all(date:, display_collection: 'all') + logger.info('COLLECTION METRIC: deleted_all') all_collections = new(date:, display_collection: 'all') where(date:, :display_collection.nin => ['all']).find_all do |collection| all_collections.inc( @@ -86,6 
+94,11 @@ def self.regenerate_all_collection_metrics!(date) total_source_clickthroughs: collection.total_source_clickthroughs ).save! end + logger.info('COLLECTION METRIC: saved') + end + + def self.record_metrics_dates_between(date_range) + record_metrics_dates_between_for(:processed_by_collection_metrics, date_range) end end end diff --git a/app/models/supplejack_api/concerns/metric_helpers.rb b/app/models/supplejack_api/concerns/metric_helpers.rb new file mode 100644 index 000000000..d7310e92d --- /dev/null +++ b/app/models/supplejack_api/concerns/metric_helpers.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +module SupplejackApi + module Concerns + module MetricHelpers + extend ActiveSupport::Concern + + module ClassMethods + # produce a logging prefix matching the original style in the model files + # e.g. "TopMetric" -> "TOP METRIC", "TopCollectionMetric" -> "TOP COLLECTION METRIC" + def log_prefix + klass = name.to_s.split('::').last + klass.gsub(/([a-z\d])([A-Z])/, '\1 \2').tr('_', ' ').upcase + end + + # Fetch distinct dates for RecordMetric where the given processed flag is false + def record_metrics_dates_between_for(processed_field, date_range) + logger.info("#{log_prefix}: Fetching dates for #{processed_field}") + dates = SupplejackApi::RecordMetric + .where(date: date_range, processed_field => false) + .distinct(:date) + logger.info("#{log_prefix}: Processing dates: #{dates}") + dates + end + + # Mark all RecordMetric rows for a given date as processed using the given flag + def stamp_record_metrics_for(processed_field, date) + logger.info("#{log_prefix}: Stamping all records on #{date} for #{processed_field}") + SupplejackApi::RecordMetric + .where(date:, processed_field => false) + .update_all(processed_field => true) + logger.info("#{log_prefix}: Stamped all records on: #{date} for #{processed_field}") + end + end + end + end +end diff --git a/app/models/supplejack_api/record_metric.rb b/app/models/supplejack_api/record_metric.rb index 
5bf6fd490..db7248a86 100644 --- a/app/models/supplejack_api/record_metric.rb +++ b/app/models/supplejack_api/record_metric.rb @@ -24,14 +24,18 @@ class RecordMetric index({ record_id: 1, display_collection: 1, date: 1 }, background: true) - index({ display_collection: 1, date: 1, processed_by_collection_metrics: 1 }, background: true) - index({ display_collection: 1, date: 1, processed_by_top_metrics: 1 }, background: true) - index({ display_collection: 1, date: 1, processed_by_top_collection_metrics: 1 }, background: true) + index({ date: 1, display_collection: 1, processed_by_collection_metrics: 1 }, background: true) + index({ date: 1, display_collection: 1, processed_by_top_metrics: 1 }, background: true) + index({ date: 1, display_collection: 1, processed_by_top_collection_metrics: 1 }, background: true) index({ display_collection: 1, date: 1 }, background: true) index({ date: 1 }, background: true) + index({ date: 1, processed_by_collection_metrics: 1 }, background: true) + index({ date: 1, processed_by_top_metrics: 1 }, background: true) + index({ date: 1, processed_by_top_collection_metrics: 1 }, background: true) + index({ processed_by_collection_metrics: 1 }, background: true) index({ processed_by_top_metrics: 1 }, background: true) index({ processed_by_top_collection_metrics: 1 }, background: true) @@ -41,6 +45,7 @@ class RecordMetric processed_by_top_metrics: 1, processed_by_top_collection_metrics: 1 }, + name: 'all_metrics', background: true ) @@ -50,9 +55,34 @@ def self.spawn(record_id, metrics, display_collection, date = Time.now.utc.begin collection.update_one( { record_id:, date: date.to_date, display_collection: }, - { '$inc' => metrics }, + { + '$setOnInsert' => { + processed_by_collection_metrics: false, + processed_by_top_metrics: false, + processed_by_top_collection_metrics: false + }, + '$inc' => metrics + }, upsert: true ) end + + # this method deletes processed metrics in batches to avoid memory issues + # and loads on the db + def 
self.delete_all_processed_metrics(batch_size = 5_000, sleep_time = 0.05) + scope = SupplejackApi::RecordMetric.where( + processed_by_collection_metrics: true, + processed_by_top_metrics: true, + processed_by_top_collection_metrics: true + ) + + loop do + ids = scope.only(:_id).limit(batch_size).pluck(:id) + break if ids.empty? + + SupplejackApi::RecordMetric.where(:_id.in => ids).delete_all + sleep sleep_time + end + end end end diff --git a/app/models/supplejack_api/request_metric.rb b/app/models/supplejack_api/request_metric.rb index 788c65d28..e18a483d7 100644 --- a/app/models/supplejack_api/request_metric.rb +++ b/app/models/supplejack_api/request_metric.rb @@ -49,8 +49,10 @@ def self.summarize metrics.each do |metric| metric.records.each do |record| - summary[date][record['record_id']]['metrics'][metric.metric] += 1 - summary[date][record['record_id']]['display_collection'] = record['display_collection'] + record_id = record['record_id'] + entry = summary[date][record_id] + entry['metrics'][metric.metric] += 1 + entry['display_collection'] = record['display_collection'] end end end diff --git a/app/models/supplejack_api/top_collection_metric.rb b/app/models/supplejack_api/top_collection_metric.rb index d280545c4..b191706e5 100644 --- a/app/models/supplejack_api/top_collection_metric.rb +++ b/app/models/supplejack_api/top_collection_metric.rb @@ -6,6 +6,7 @@ class TopCollectionMetric include Mongoid::Document include Mongoid::Timestamps include SupplejackApi::Concerns::QueryableByDate + include SupplejackApi::Concerns::MetricHelpers METRICS = %i[ page_views @@ -33,10 +34,7 @@ def self.spawn(date_range = (Time.zone.at(0).utc..Time.now.yesterday.utc.beginni metrics = [] - dates = SupplejackApi::RecordMetric.where(date: date_range).map(&:date).uniq - Rails.logger.info("TOP COLLECTION METRIC: processing dates: #{dates}") - - dates.each do |date| + record_metrics_dates_between(date_range).each do |date| display_collections(date).each do |dc| METRICS.each do |metric| 
record_metrics = record_metrics_to_be_processed(date, metric, dc) @@ -52,7 +50,7 @@ def self.spawn(date_range = (Time.zone.at(0).utc..Time.now.yesterday.utc.beginni metrics.push(top_collection_metric) end end - Rails.logger.info("TOP COLLECTION METRIC: Stampping all records on #{date}") + stamp_record_metrics(date) end @@ -60,10 +58,10 @@ def self.spawn(date_range = (Time.zone.at(0).utc..Time.now.yesterday.utc.beginni end def self.display_collections(date) - Rails.logger.info("TOP COLLECTION METRIC: Finding all display collections on #{date}") + logger.info("TOP COLLECTION METRIC: Finding all display collections on #{date}") SupplejackApi::RecordMetric.where( date:, - :processed_by_top_collection_metrics.in => [nil, '', false] + processed_by_top_collection_metrics: false ).map(&:display_collection).uniq end @@ -75,11 +73,13 @@ def self.calculate_results(record_metrics, metric) end def self.update_top_collection_metric(top_collection_metric, results) - if top_collection_metric.results.blank? + existing_results = top_collection_metric.results + + if existing_results.blank? 
top_collection_metric.update(results:) else - merged_results = top_collection_metric.results.merge(results) { |_key, a, b| a + b } - merged_results = merged_results.sort_by { |_k, v| -v }.first(200).to_h + merged_results = existing_results.merge(results) { |_key, existing, incoming| existing + incoming } + merged_results = merged_results.sort_by { |_k, value| -value }.first(200).to_h top_collection_metric.update(results: merged_results) end @@ -96,18 +96,22 @@ def self.find_or_create_top_collection_metric(date, metric, display_collection) end def self.record_metrics_to_be_processed(date, metric, display_collection) - Rails.logger.info("TOP COLLECTION METRIC: Gathering top 200 records to be - processed #{date}, #{metric}, #{display_collection}") + logger.info('TOP COLLECTION METRIC: ' \ + "Gathering top 200 records to be processed #{date}, #{metric}, #{display_collection}") SupplejackApi::RecordMetric.where( date:, metric.ne => 0, display_collection:, - :processed_by_top_collection_metrics.in => [nil, '', false] + processed_by_top_collection_metrics: false ).order_by(metric => 'desc').limit(200) end + def self.record_metrics_dates_between(date_range) + record_metrics_dates_between_for(:processed_by_top_collection_metrics, date_range) + end + def self.stamp_record_metrics(date) - SupplejackApi::RecordMetric.where(date:).update_all(processed_by_top_collection_metrics: true) + stamp_record_metrics_for(:processed_by_top_collection_metrics, date) end end end diff --git a/app/models/supplejack_api/top_metric.rb b/app/models/supplejack_api/top_metric.rb index 7e780c823..6a6caeae9 100644 --- a/app/models/supplejack_api/top_metric.rb +++ b/app/models/supplejack_api/top_metric.rb @@ -13,6 +13,7 @@ module SupplejackApi class TopMetric include Mongoid::Document include SupplejackApi::Concerns::QueryableByDate + include SupplejackApi::Concerns::MetricHelpers METRICS = %i[ page_views @@ -37,10 +38,7 @@ class TopMetric def self.spawn(date_range = 
(Time.zone.at(0).utc..Time.now.yesterday.utc.beginning_of_day)) return unless SupplejackApi.config.log_metrics == true - dates = SupplejackApi::RecordMetric.where(date: date_range).map(&:date).uniq - Rails.logger.info("TOP METRIC: processing dates: #{dates}") - - dates.each do |date| + record_metrics_dates_between(date_range).each do |date| METRICS.each do |metric| record_metrics = record_metrics_to_be_processed(date, metric) results = record_metrics.each_with_object({}) do |record, hash| @@ -49,32 +47,38 @@ def self.spawn(date_range = (Time.zone.at(0).utc..Time.now.yesterday.utc.beginni next if results.empty? - metric = find_or_create_by( - date:, - metric: - ) + top_metric = find_or_create_by(date:, metric:) + existing_results = top_metric.results - if metric.results.blank? - metric.update(results:) + if existing_results.blank? + top_metric.update(results:) else - merged_results = metric.results.merge(results) { |_key, a, b| a + b } - merged_results = merged_results.sort_by { |_k, v| -v }.first(200).to_h + merged_results = existing_results.merge(results) { |_key, existing, incoming| existing + incoming } + merged_results = merged_results.sort_by { |_k, value| -value }.first(200).to_h - metric.update(results: merged_results) + top_metric.update(results: merged_results) end end - Rails.logger.info("TOP METRIC: Stampping all records on: #{date}") - SupplejackApi::RecordMetric.where(date:).update_all(processed_by_top_metrics: true) + + stamp_record_metrics(date) end end def self.record_metrics_to_be_processed(date, metric) - Rails.logger.info("TOP METRIC: Gathering records to be processed: #{date} #{metric}") + logger.info("TOP METRIC: Gathering records to be processed: #{date} #{metric}") SupplejackApi::RecordMetric.where( date:, metric.ne => 0, - :processed_by_top_metrics.in => [nil, '', false] + processed_by_top_metrics: false ).order_by(metric => 'desc').limit(200) end + + def self.record_metrics_dates_between(date_range) + 
record_metrics_dates_between_for(:processed_by_top_metrics, date_range) + end + + def self.stamp_record_metrics(date) + stamp_record_metrics_for(:processed_by_top_metrics, date) + end end end