Skip to content

migration ignores documents already in DB #445

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 8, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 75 additions & 56 deletions migration/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,11 @@ async fn migrate_data(db: &Database) -> Result<()> {
let mut morpheme_relations = Vec::new();
let mut document_contents = Vec::new();
for (coll_index, collection) in index.collections.into_iter().enumerate() {
// TODO Do we need to add genres to the database like this? Is this used anywhere?
let collection_id = db
.insert_top_collection(collection.title, coll_index as i64)
.await?;

for (order_index, sheet_id) in collection.sheet_ids.into_iter().enumerate() {
if let Some((doc, mut refs)) =
fetch_sheet(Some(db), &sheet_id, collection_id, order_index as i64).await?
Expand All @@ -120,6 +122,7 @@ async fn migrate_data(db: &Database) -> Result<()> {

/// Fetch the contents of the sheet with the given ID, validating the first page as
/// annotation lines and the "Metadata" page as [dailp::DocumentMetadata].
/// Ignores documents already present in the database (matched by short name),
/// returning `Ok(None)` for them.
async fn fetch_sheet(
db: Option<&Database>,
sheet_id: &str,
Expand All @@ -132,69 +135,85 @@ async fn fetch_sheet(
// This includes publication information and a link to the translation.
let meta = SheetResult::from_sheet(sheet_id, Some(METADATA_SHEET_NAME)).await;

if let Ok(meta_sheet) = meta {
let meta = SheetInterpretation { sheet: meta_sheet }
.into_metadata(db, false, order_index)
.await?;
if meta.is_err() {
return Ok(None);
}

println!("---Processing document: {}---", meta.short_name);
let meta_sheet = meta.unwrap();
let meta = SheetInterpretation { sheet: meta_sheet }
.into_metadata(db, false, order_index)
.await?;

// Parse references for this particular document.
println!("parsing references...");
let refs = SheetResult::from_sheet(sheet_id, Some(REFERENCES_SHEET_NAME)).await;
let refs = if let Ok(refs) = refs {
SheetInterpretation { sheet: refs }
.into_references(&meta.short_name)
.await
} else {
Vec::new()
};
if db.is_none() {
return Ok(None);
}

let document_id = if let Some(db) = db {
db.insert_document(&meta, collection_id, order_index)
.await?
} else {
Default::default()
};
// Fill in blank UUID.
let meta = dailp::DocumentMetadata {
id: document_id,
..meta
};
// Check if this document exists in the database
let db = db.unwrap();
let doc_id_in_db = db.document_id_from_name(&meta.short_name).await?;

if doc_id_in_db.is_some() {
println!(
"{} already exists with ID {}.",
meta.short_name,
doc_id_in_db.unwrap().0
);
return Ok(None);
}

let page_count = meta
.page_images
.as_ref()
.map(|images| images.count())
.unwrap_or(0);
let mut all_lines = Vec::new();
// Each document page lives in its own tab.
for index in 0..page_count {
let tab_name = if page_count > 1 {
println!("Pulling Page {}...", index + 1);
Some(format!("Page {}", index + 1))
} else {
None
};
println!("---Processing document: {}---", meta.short_name);

// Split the contents of each main sheet into semantic lines with
// several layers.
let mut lines = SheetInterpretation {
sheet: SheetResult::from_sheet(sheet_id, tab_name.as_deref()).await?,
}
.split_into_lines();
// TODO Consider page breaks embedded in the last word of a page.
lines.last_mut().unwrap().ends_page = true;
// Parse references for this particular document.
println!("parsing references...");
let refs = SheetResult::from_sheet(sheet_id, Some(REFERENCES_SHEET_NAME)).await;
let refs = if let Ok(refs) = refs {
SheetInterpretation { sheet: refs }
.into_references(&meta.short_name)
.await
} else {
Vec::new()
};

let document_id = db
.insert_document(&meta, collection_id, order_index)
.await?;

// Fill in blank UUID.
let meta = dailp::DocumentMetadata {
id: document_id,
..meta
};

let page_count = meta
.page_images
.as_ref()
.map(|images| images.count())
.unwrap_or(0);
let mut all_lines = Vec::new();
// Each document page lives in its own tab.
for index in 0..page_count {
let tab_name = if page_count > 1 {
println!("Pulling Page {}...", index + 1);
Some(format!("Page {}", index + 1))
} else {
None
};

all_lines.append(&mut lines);
tokio::time::sleep(Duration::from_millis(1000)).await;
// Split the contents of each main sheet into semantic lines with
// several layers.
let mut lines = SheetInterpretation {
sheet: SheetResult::from_sheet(sheet_id, tab_name.as_deref()).await?,
}
let annotated = AnnotatedLine::many_from_semantic(&all_lines, &meta)?;
let segments = AnnotatedLine::lines_into_segments(annotated, &document_id, &meta.date);
let doc = dailp::AnnotatedDoc::new(meta, segments);
.split_into_lines();
// TODO Consider page breaks embedded in the last word of a page.
lines.last_mut().unwrap().ends_page = true;

Ok(Some((doc, refs)))
} else {
Ok(None)
all_lines.append(&mut lines);
tokio::time::sleep(Duration::from_millis(1000)).await;
}
let annotated = AnnotatedLine::many_from_semantic(&all_lines, &meta)?;
let segments = AnnotatedLine::lines_into_segments(annotated, &document_id, &meta.date);
let doc = dailp::AnnotatedDoc::new(meta, segments);

Ok(Some((doc, refs)))
}