Skip to content

Commit

Permalink
improvements to the mbox import script
Browse files Browse the repository at this point in the history
* ignores dot-files and empty emails
* new setting to prefer HTML over plaintext emails during import
* restore original site settings at the end of import
* elided content of HTML mails was not put inside details block
  • Loading branch information
gschlager committed Nov 18, 2017
1 parent 6dda87c commit 32dd1e6
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 11 deletions.
1 change: 1 addition & 0 deletions script/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
tmp/*
settings.local.yml
11 changes: 6 additions & 5 deletions script/import_scripts/mbox/importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ def initialize(settings)
@database = Database.new(@settings.data_dir, @settings.batch_size)
end

def change_site_settings
super

SiteSetting.enable_staged_users = true
# Extends the site settings returned by the parent implementation with the
# values this importer needs: staged users are enabled and the HTML
# preference is taken from the importer's own settings object.
#
# @return the settings object returned by `super`, updated in place
def get_site_settings_for_import
  super.tap do |overrides|
    overrides[:enable_staged_users] = true
    overrides[:incoming_email_prefer_html] = @settings.prefer_html
  end
end

protected
Expand Down Expand Up @@ -120,7 +121,7 @@ def format_raw(email_body, attachment_html, elided, format)
when Email::Receiver::formats[:markdown]
body = email_body
body << attachment_html if attachment_html.present?
body << elided if elided.present?
body << Email::Receiver.elided_html(elided) if elided.present?
when Email::Receiver::formats[:plaintext]
body = %|[plaintext]\n#{escape_tags(email_body)}\n[/plaintext]|
body << %|\n[attachments]\n#{escape_tags(attachment_html)}\n[/attachments]| if attachment_html.present?
Expand Down
5 changes: 3 additions & 2 deletions script/import_scripts/mbox/settings.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# PostgreSQL mailing lists
#data_dir: /data/import/postgres
#data_dir: /shared/import/data
#split_regex: "^From .*@postgresql.org.*"

# ruby-talk mailing list
data_dir: /data/import/ruby-talk/news/gmane/comp/lang/ruby
data_dir: /shared/import/data
split_regex: ""

default_trust_level: 1
prefer_html: false
15 changes: 11 additions & 4 deletions script/import_scripts/mbox/support/indexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,12 @@ def all_messages(directory, category_name)

if @split_regex.present?
each_mail(filename) do |raw_message, first_line_number, last_line_number|
yield read_mail_from_string(raw_message), filename, first_line_number, last_line_number
receiver = read_mail_from_string(raw_message)
yield receiver, filename, first_line_number, last_line_number if receiver.present?
end
else
yield read_mail_from_file(filename), filename
receiver = read_mail_from_file(filename)
yield receiver, filename if receiver.present?
end

mark_as_fully_indexed(category_name, filename)
Expand Down Expand Up @@ -161,7 +163,7 @@ def read_mail_from_file(filename)
end

def read_mail_from_string(raw_message)
Email::Receiver.new(raw_message)
Email::Receiver.new(raw_message) unless raw_message.blank?
end

def extract_reply_message_ids(mail)
Expand Down Expand Up @@ -208,7 +210,12 @@ def clean_subject(subject)
end

def ignored_file?(filename, checksums)
File.directory?(filename) || metadata_file?(filename) || fully_indexed?(filename, checksums)
File.directory?(filename) || hidden_file?(filename) ||
metadata_file?(filename) || fully_indexed?(filename, checksums)
end

# Tells whether the last path component of +filename+ begins with a dot
# (e.g. ".DS_Store"). Only the basename is inspected, so dots in parent
# directory names do not count.
#
# @param filename [String] path to a file
# @return [Boolean]
def hidden_file?(filename)
  basename = File.basename(filename)
  basename.start_with?('.')
end

def metadata_file?(filename)
Expand Down
2 changes: 2 additions & 0 deletions script/import_scripts/mbox/support/settings.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@ def self.load(filename)
attr_reader :split_regex
attr_reader :batch_size
attr_reader :trust_level
attr_reader :prefer_html

# Populates the importer settings from a parsed YAML document.
#
# @param yaml [Hash] parsed settings; the string keys 'data_dir',
#   'split_regex', 'default_trust_level' and 'prefer_html' are read
def initialize(yaml)
  @data_dir = yaml['data_dir']

  # An absent or empty 'split_regex' leaves @split_regex unset (nil).
  # The nil check avoids a NoMethodError when the key is missing from the
  # settings file instead of being set to an empty string.
  split_regex = yaml['split_regex']
  @split_regex = Regexp.new(split_regex) if split_regex && !split_regex.empty?

  @batch_size = 1000 # no need to make this actually configurable at the moment
  @trust_level = yaml['default_trust_level']
  @prefer_html = yaml['prefer_html']
end
end
end

0 comments on commit 32dd1e6

Please sign in to comment.