From 23f92a0cd4882ba0d4ed239e140fc5fc073abbb6 Mon Sep 17 00:00:00 2001 From: Naomi Trevino Date: Thu, 27 Feb 2025 15:12:20 -0800 Subject: [PATCH] migration ignores documents already in DB --- migration/src/main.rs | 131 ++++++++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 56 deletions(-) diff --git a/migration/src/main.rs b/migration/src/main.rs index df5bf980..e872d34c 100644 --- a/migration/src/main.rs +++ b/migration/src/main.rs @@ -91,9 +91,11 @@ async fn migrate_data(db: &Database) -> Result<()> { let mut morpheme_relations = Vec::new(); let mut document_contents = Vec::new(); for (coll_index, collection) in index.collections.into_iter().enumerate() { + // TODO do we need to add genres to the database like this? Do we use this anywhere?? let collection_id = db .insert_top_collection(collection.title, coll_index as i64) .await?; + for (order_index, sheet_id) in collection.sheet_ids.into_iter().enumerate() { if let Some((doc, mut refs)) = fetch_sheet(Some(db), &sheet_id, collection_id, order_index as i64).await? @@ -120,6 +122,7 @@ async fn migrate_data(db: &Database) -> Result<()> { /// Fetch the contents of the sheet with the given ID, validating the first page as /// annotation lines and the "Metadata" page as [dailp::DocumentMetadata]. +/// Ignores documents already present in the database. async fn fetch_sheet( db: Option<&Database>, sheet_id: &str, @@ -132,69 +135,85 @@ async fn fetch_sheet( // This includes publication information and a link to the translation. let meta = SheetResult::from_sheet(sheet_id, Some(METADATA_SHEET_NAME)).await; - if let Ok(meta_sheet) = meta { - let meta = SheetInterpretation { sheet: meta_sheet } - .into_metadata(db, false, order_index) - .await?; + if meta.is_err() { + return Ok(None); + } - println!("---Processing document: {}---", meta.short_name); + let meta_sheet = meta.unwrap(); + let meta = SheetInterpretation { sheet: meta_sheet } + .into_metadata(db, false, order_index) + .await?; - // Parse references for this particular document. - println!("parsing references..."); - let refs = SheetResult::from_sheet(sheet_id, Some(REFERENCES_SHEET_NAME)).await; - let refs = if let Ok(refs) = refs { - SheetInterpretation { sheet: refs } - .into_references(&meta.short_name) - .await - } else { - Vec::new() - }; + if db.is_none() { + return Ok(None); + } - let document_id = if let Some(db) = db { - db.insert_document(&meta, collection_id, order_index) - .await? - } else { - Default::default() - }; - // Fill in blank UUID. - let meta = dailp::DocumentMetadata { - id: document_id, - ..meta - }; + // Check if this document exists in the database + let db = db.unwrap(); + let doc_id_in_db = db.document_id_from_name(&meta.short_name).await?; + + if doc_id_in_db.is_some() { + println!( + "{} already exists with ID {}.", + meta.short_name, + doc_id_in_db.unwrap().0 + ); + return Ok(None); + } - let page_count = meta - .page_images - .as_ref() - .map(|images| images.count()) - .unwrap_or(0); - let mut all_lines = Vec::new(); - // Each document page lives in its own tab. - for index in 0..page_count { - let tab_name = if page_count > 1 { - println!("Pulling Page {}...", index + 1); - Some(format!("Page {}", index + 1)) - } else { - None - }; + println!("---Processing document: {}---", meta.short_name); - // Split the contents of each main sheet into semantic lines with - // several layers. - let mut lines = SheetInterpretation { - sheet: SheetResult::from_sheet(sheet_id, tab_name.as_deref()).await?, - } - .split_into_lines(); - // TODO Consider page breaks embedded in the last word of a page. - lines.last_mut().unwrap().ends_page = true; + // Parse references for this particular document. + println!("parsing references..."); + let refs = SheetResult::from_sheet(sheet_id, Some(REFERENCES_SHEET_NAME)).await; + let refs = if let Ok(refs) = refs { + SheetInterpretation { sheet: refs } + .into_references(&meta.short_name) + .await + } else { + Vec::new() + }; + + let document_id = db + .insert_document(&meta, collection_id, order_index) + .await?; + + // Fill in blank UUID. + let meta = dailp::DocumentMetadata { + id: document_id, + ..meta + }; + + let page_count = meta + .page_images + .as_ref() + .map(|images| images.count()) + .unwrap_or(0); + let mut all_lines = Vec::new(); + // Each document page lives in its own tab. + for index in 0..page_count { + let tab_name = if page_count > 1 { + println!("Pulling Page {}...", index + 1); + Some(format!("Page {}", index + 1)) + } else { + None + }; - all_lines.append(&mut lines); - tokio::time::sleep(Duration::from_millis(1000)).await; + // Split the contents of each main sheet into semantic lines with + // several layers. + let mut lines = SheetInterpretation { + sheet: SheetResult::from_sheet(sheet_id, tab_name.as_deref()).await?, } - let annotated = AnnotatedLine::many_from_semantic(&all_lines, &meta)?; - let segments = AnnotatedLine::lines_into_segments(annotated, &document_id, &meta.date); - let doc = dailp::AnnotatedDoc::new(meta, segments); + .split_into_lines(); + // TODO Consider page breaks embedded in the last word of a page. + lines.last_mut().unwrap().ends_page = true; - Ok(Some((doc, refs))) - } else { - Ok(None) + all_lines.append(&mut lines); + tokio::time::sleep(Duration::from_millis(1000)).await; } + let annotated = AnnotatedLine::many_from_semantic(&all_lines, &meta)?; + let segments = AnnotatedLine::lines_into_segments(annotated, &document_id, &meta.date); + let doc = dailp::AnnotatedDoc::new(meta, segments); + + Ok(Some((doc, refs))) }