diff --git a/audit/common/CodeNode.go b/audit/common/CodeNode.go index 550a19d..2414738 100644 --- a/audit/common/CodeNode.go +++ b/audit/common/CodeNode.go @@ -4,14 +4,15 @@ import "time" // CodeNode captures metadata about a specific code example. The `Code` field contains the example itself. type CodeNode struct { - Code string `bson:"code"` - Language string `bson:"language"` - FileExtension string `bson:"file_extension"` - Category string `bson:"category"` - SHA256Hash string `bson:"sha_256_hash"` - LLMCategorized bool `bson:"llm_categorized"` - DateAdded time.Time `bson:"date_added"` - DateUpdated time.Time `bson:"date_updated,omitempty"` - DateRemoved time.Time `bson:"date_removed,omitempty"` - IsRemoved bool `bson:"is_removed,omitempty"` + Code string `bson:"code"` + Language string `bson:"language"` + FileExtension string `bson:"file_extension"` + Category string `bson:"category"` + SHA256Hash string `bson:"sha_256_hash"` + LLMCategorized bool `bson:"llm_categorized"` + DateAdded time.Time `bson:"date_added"` + DateUpdated time.Time `bson:"date_updated,omitempty"` + DateRemoved time.Time `bson:"date_removed,omitempty"` + IsRemoved bool `bson:"is_removed,omitempty"` + InstancesOnPage int `bson:"instances_on_page,omitempty"` } diff --git a/audit/gdcd/CheckPagesForUpdates.go b/audit/gdcd/CheckPagesForUpdates.go index c8fdc57..91bdb1e 100644 --- a/audit/gdcd/CheckPagesForUpdates.go +++ b/audit/gdcd/CheckPagesForUpdates.go @@ -3,9 +3,12 @@ package main import ( "common" "context" + "fmt" "gdcd/db" + "gdcd/snooty" "gdcd/types" "gdcd/utils" + "time" "github.com/tmc/langchaingo/llms/ollama" ) @@ -17,8 +20,13 @@ import ( func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetails, llm *ollama.LLM, ctx context.Context, report types.ProjectReport) { incomingPageIdsMatchingExistingPages := make(map[string]bool) incomingDeletedPageCount := 0 - var newPageIds []string - var newPages []common.DocsPage + + // When a page doesn't match one in the DB, 
it could be either net new or a moved page. Hold it in a temp array + // for comparison + var maybeNewPages []types.NewOrMovedPage + var newPages []types.NewOrMovedPage + var newPageDBEntries []common.DocsPage + var movedPages []types.NewOrMovedPage var updatedPages []common.DocsPage for _, page := range pages { // The Snooty Data API returns pages that may have been deleted. If the page is deleted, we want to check and see @@ -27,6 +35,7 @@ func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetail if page.Data.Deleted { report = HandleDeletedIncomingPages(project.ProjectName, page, report) incomingDeletedPageCount++ + utils.UpdateSecondaryTarget() } else { maybeExistingPage := CheckForExistingPage(project.ProjectName, page) if maybeExistingPage != nil { @@ -39,20 +48,72 @@ func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetail if updatedPage != nil { updatedPages = append(updatedPages, *updatedPage) } + utils.UpdateSecondaryTarget() } else { - // If there is no existing document in Atlas that matches the page, we need to make a new page - var newPage common.DocsPage - newPage, report = MakeNewPage(page, project.ProdUrl, report, llm, ctx) - newPageIds = append(newPageIds, newPage.ID) - newPages = append(newPages, newPage) + // If there is no existing document in Atlas that matches the page, we need to make a new page. BUT! + // It might actually be a new or moved page. So store it in a temp `maybeNewPages` slice so we can compare + // it against removed pages later and potentially call it a "moved" page, instead. + newOrMovedPage := getNewOrMovedPageDetails(page.Data) + maybeNewPages = append(maybeNewPages, newOrMovedPage) } } - utils.UpdateSecondaryTarget() } // After iterating through the incoming pages from the Snooty Data API, we need to figure out if any of the page IDs - // we had in the DB are not coming in from the incoming response. If so, we should delete those entries. 
- report = db.HandleMissingPageIds(project.ProjectName, incomingPageIdsMatchingExistingPages, report) + // we had in the DB are not coming in from the incoming response. If so, those pages are either moved or removed. + report, newPages, movedPages = db.HandleMissingPageIds(project.ProjectName, incomingPageIdsMatchingExistingPages, maybeNewPages, report) + + // If we have new pages, create the corresponding DocsPage and increment the project report for them + if newPages != nil { + for _, page := range newPages { + newPage := MakeNewPage(page.PageData, project.ProjectName, project.ProdUrl, llm, ctx) + newPageDBEntries = append(newPageDBEntries, newPage) + report = UpdateProjectReportForNewPage(newPage, report) + utils.UpdateSecondaryTarget() + } + } + + // If we have moved pages, handle them + if movedPages != nil { + for _, page := range movedPages { + var movedPage common.DocsPage + oldPage := db.GetAtlasPageData(project.ProjectName, page.OldPageId) + + if oldPage != nil { + movedPage = *oldPage + movedPage.ID = page.NewPageId + newPageUrl := utils.ConvertAtlasPageIdToProductionUrl(page.NewPageId, project.ProdUrl) + movedPage.DateLastUpdated = time.Now() + movedPage.PageURL = newPageUrl + } else { + movedPage = MakeNewPage(page.PageData, project.ProjectName, project.ProdUrl, llm, ctx) + movedPage.DateAdded = page.DateAdded + } + + // Remove the old page from the DB + db.RemovePageFromAtlas(project.ProjectName, page.OldPageId) + + // Append the "moved" page to the `newPageDBEntries` array. Because the page ID doesn't match the old one, + // we write it to the DB as a new page. Because we just deleted the old page, it works out to the same count + // and provides the up-to-date data in the DB. 
+ newPageDBEntries = append(newPageDBEntries, movedPage) + + incomingAstCodeNodes, incomingAstLiteralIncludeNodes, incomingAstIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(page.PageData.AST) + incomingAstCodeNodeCount := len(incomingAstCodeNodes) + incomingAstLiteralIncludeNodesCount := len(incomingAstLiteralIncludeNodes) + incomingAstIoCodeBlockNodesCount := len(incomingAstIoCodeBlockNodes) + // Update the project counts for the "existing" page + report = IncrementProjectCountsForExistingPage(incomingAstCodeNodeCount, incomingAstLiteralIncludeNodesCount, incomingAstIoCodeBlockNodesCount, movedPage, report) + + // Report it in the logs as a moved page + stringMessageForReport := fmt.Sprintf("Old page ID: %s, new page ID: %s", page.OldPageId, page.NewPageId) + report = utils.ReportChanges(types.PageMoved, report, stringMessageForReport) + if movedPage.CodeNodesTotal != incomingAstCodeNodeCount { + report = utils.ReportIssues(types.CodeNodeCountIssue, report, page.NewPageId, page.CodeNodeCount, len(incomingAstCodeNodes)) + } + utils.UpdateSecondaryTarget() + } + } // Get the existing "summaries" document from the DB, and update it. var summaryDoc common.CollectionReport @@ -65,5 +126,17 @@ func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetail LogReportForProject(project.ProjectName, report) // At this point, we have all the new and updated pages and an updated summary. Write updates to Atlas. 
- db.BatchUpdateCollection(project.ProjectName, newPages, updatedPages, summaryDoc) + db.BatchUpdateCollection(project.ProjectName, newPageDBEntries, updatedPages, summaryDoc) +} + +func getNewOrMovedPageDetails(metadata types.PageMetadata) types.NewOrMovedPage { + incomingCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(metadata.AST) + pageId := utils.ConvertSnootyPageIdToAtlasPageId(metadata.PageID) + return types.NewOrMovedPage{ + PageId: pageId, + CodeNodeCount: len(incomingCodeNodes), + LiteralIncludeCount: len(incomingLiteralIncludeNodes), + IoCodeBlockCount: len(incomingIoCodeBlockNodes), + PageData: metadata, + } } diff --git a/audit/gdcd/UpdateProjectReportForUpdatedCodeNodes.go b/audit/gdcd/IncrementProjectCountsForExistingPage.go similarity index 100% rename from audit/gdcd/UpdateProjectReportForUpdatedCodeNodes.go rename to audit/gdcd/IncrementProjectCountsForExistingPage.go diff --git a/audit/gdcd/MakeLanguagesArray.go b/audit/gdcd/MakeLanguagesArray.go index 8d7bf96..296c08e 100644 --- a/audit/gdcd/MakeLanguagesArray.go +++ b/audit/gdcd/MakeLanguagesArray.go @@ -18,12 +18,23 @@ func MakeLanguagesArray(codeNodes []common.CodeNode, literalIncludeNodes []types continue } else { if languageCounts, exists := languages[node.Language]; exists { - languageCounts.Total += 1 - languages[node.Language] = languageCounts + if node.InstancesOnPage > 0 { + languageCounts.Total += node.InstancesOnPage + languages[node.Language] = languageCounts + } else { + languageCounts.Total += 1 + languages[node.Language] = languageCounts + } } else { - countsForLang := languages[common.Undefined] - countsForLang.LiteralIncludes += 1 - languages[common.Undefined] = countsForLang + if node.InstancesOnPage > 0 { + countsForLang := languages[common.Undefined] + countsForLang.Total += node.InstancesOnPage + languages[common.Undefined] = countsForLang + } else { + countsForLang := languages[common.Undefined] + 
countsForLang.Total += 1 + languages[common.Undefined] = countsForLang + } } } } diff --git a/audit/gdcd/MakeNewPage.go b/audit/gdcd/MakeNewPage.go index 6ac808c..c37c776 100644 --- a/audit/gdcd/MakeNewPage.go +++ b/audit/gdcd/MakeNewPage.go @@ -3,7 +3,6 @@ package main import ( "common" "context" - add_code_examples "gdcd/add-code-examples" "gdcd/snooty" "gdcd/types" "gdcd/utils" @@ -12,35 +11,51 @@ import ( "github.com/tmc/langchaingo/llms/ollama" ) -func MakeNewPage(data types.PageWrapper, siteUrl string, report types.ProjectReport, llm *ollama.LLM, ctx context.Context) (common.DocsPage, types.ProjectReport) { - incomingCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(data.Data.AST) +func MakeNewPage(data types.PageMetadata, projectName string, siteUrl string, llm *ollama.LLM, ctx context.Context) common.DocsPage { + incomingCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(data.AST) incomingCodeNodeCount := len(incomingCodeNodes) incomingLiteralIncludeNodeCount := len(incomingLiteralIncludeNodes) incomingIoCodeNodeCount := len(incomingIoCodeBlockNodes) - pageId := utils.ConvertSnootyPageIdToAtlasPageId(data.Data.PageID) - pageUrl := utils.ConvertSnootyPageIdToProductionUrl(data.Data.PageID, siteUrl) - product, subProduct := GetProductSubProduct(report.ProjectName, pageUrl) + pageId := utils.ConvertSnootyPageIdToAtlasPageId(data.PageID) + pageUrl := utils.ConvertSnootyPageIdToProductionUrl(data.PageID, siteUrl) + product, subProduct := GetProductSubProduct(projectName, pageUrl) var isDriversProject bool if product == "Drivers" { isDriversProject = true } else { isDriversProject = false } - newAppliedUsageExampleCount := 0 - var newCodeNodes []common.CodeNode + + // Some of the new code examples coming in from the page may be duplicates. 
So we first make Sha256 hashes of the + // incoming code examples, and count the number of times the hash appears on the page. + snootySha256Hashes := make(map[string]int) + snootySha256ToAstNodeMap := make(map[string]types.ASTNode) + for _, node := range incomingCodeNodes { - newNode := snooty.MakeCodeNodeFromSnootyAST(node, llm, ctx, isDriversProject) - newCodeNodes = append(newCodeNodes, newNode) - if add_code_examples.IsNewAppliedUsageExample(newNode) { - newAppliedUsageExampleCount++ + // This makes a hash from the whitespace-trimmed AST node. We trim whitespace on AST nodes before adding + // them to the DB, so this ensures an incoming node hash can match a whitespace-trimmed existing node hash. + hash := snooty.MakeSha256HashForCode(node.Value) + + // Add the hash as an entry in the map, and increment its counter. If the hash does not already exist in the map, + // this will create it. If it does already exist, this will just increment its counter. + snootySha256Hashes[hash]++ + snootySha256ToAstNodeMap[hash] = node + } + + // Then, we go through the hashes, create the corresponding codeNodes, and set the `InstancesOnPage` if the example + // appears more than once on the page. 
+ var newCodeNodes []common.CodeNode + for hash, count := range snootySha256Hashes { + newNode := snooty.MakeCodeNodeFromSnootyAST(snootySha256ToAstNodeMap[hash], llm, ctx, isDriversProject) + if count > 1 { + newNode.InstancesOnPage = count } + newCodeNodes = append(newCodeNodes, newNode) } - maybeKeywords := snooty.GetMetaKeywords(data.Data.AST.Children) - languagesArrayValues := MakeLanguagesArray(newCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes) + maybeKeywords := snooty.GetMetaKeywords(data.AST.Children) - // Report relevant details for the new page - report = UpdateProjectReportForNewPage(incomingCodeNodeCount, incomingLiteralIncludeNodeCount, incomingIoCodeNodeCount, len(newCodeNodes), newAppliedUsageExampleCount, pageId, report) + languagesArrayValues := MakeLanguagesArray(newCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes) return common.DocsPage{ ID: pageId, @@ -52,9 +67,9 @@ func MakeNewPage(data types.PageWrapper, siteUrl string, report types.ProjectRep LiteralIncludesTotal: incomingLiteralIncludeNodeCount, Nodes: &newCodeNodes, PageURL: pageUrl, - ProjectName: report.ProjectName, + ProjectName: projectName, Product: product, SubProduct: subProduct, Keywords: maybeKeywords, - }, report + } } diff --git a/audit/gdcd/UpdateExistingPage.go b/audit/gdcd/UpdateExistingPage.go index 5ff9b80..ed2c574 100644 --- a/audit/gdcd/UpdateExistingPage.go +++ b/audit/gdcd/UpdateExistingPage.go @@ -3,8 +3,8 @@ package main import ( "common" "context" - add_code_examples "gdcd/add-code-examples" - compare_code_examples "gdcd/compare-code-examples" + "gdcd/add-code-examples" + "gdcd/compare-code-examples" "gdcd/db" "gdcd/snooty" "gdcd/types" @@ -17,14 +17,15 @@ import ( func UpdateExistingPage(existingPage common.DocsPage, data types.PageWrapper, projectReport types.ProjectReport, llm *ollama.LLM, ctx context.Context) (*common.DocsPage, types.ProjectReport) { var existingCurrentCodeNodes []common.CodeNode var existingRemovedCodeNodes 
[]common.CodeNode + existingCodeNodeCount := 0 // Some of the existing Nodes on the page could have been previously removed from the page. So we need to know which // nodes are "currently" on the page, and which nodes have already been removed. The ones that are "currently" on the // page should be used to compare code examples, but the ones that have already been removed from the page will be // appended to the Nodes array without changes after making all the other updates. if existingPage.Nodes != nil { existingCurrentCodeNodes, existingRemovedCodeNodes = db.GetCurrentRemovedAtlasCodeNodes(*existingPage.Nodes) + existingCodeNodeCount = compare_code_examples.GetCodeNodeCount(*existingPage.Nodes) } - existingCodeNodeCount := len(existingCurrentCodeNodes) incomingCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(data.Data.AST) maybePageKeywords := snooty.GetMetaKeywords(data.Data.AST.Children) newAppliedUsageExampleCount := 0 @@ -75,7 +76,12 @@ func UpdateExistingPage(existingPage common.DocsPage, data types.PageWrapper, pr node.DateRemoved = time.Now() node.IsRemoved = true updatedCodeNodes = append(updatedCodeNodes, node) - newRemovedNodeCount++ + if node.InstancesOnPage > 0 { + newRemovedNodeCount += node.InstancesOnPage + node.InstancesOnPage = 0 + } else { + newRemovedNodeCount++ + } } else { updatedCodeNodes = append(updatedCodeNodes, node) } @@ -123,7 +129,7 @@ func UpdateExistingPage(existingPage common.DocsPage, data types.PageWrapper, pr newAppliedUsageExampleCount++ } } - newCodeNodeCount := len(newCodeNodes) + newCodeNodeCount := compare_code_examples.GetCodeNodeCount(newCodeNodes) updatedPage.Nodes = &newCodeNodes // Update the AST code node count, io-block-count and literalinclude count diff --git a/audit/gdcd/UpdateProjectReportForNewPage.go b/audit/gdcd/UpdateProjectReportForNewPage.go index 6135bc8..498e4a4 100644 --- a/audit/gdcd/UpdateProjectReportForNewPage.go +++ 
b/audit/gdcd/UpdateProjectReportForNewPage.go @@ -1,29 +1,43 @@ package main import ( + "common" + "gdcd/add-code-examples" + compare_code_examples "gdcd/compare-code-examples" "gdcd/types" "gdcd/utils" ) -func UpdateProjectReportForNewPage(incomingCodeNodeCount int, incomingLiteralIncludeNodeCount int, incomingIoCodeBlockNodeCount int, newCodeNodes int, newAppliedUsageExampleCount int, pageId string, report types.ProjectReport) types.ProjectReport { - report.Counter.IncomingCodeNodesCount += incomingCodeNodeCount - report.Counter.IncomingLiteralIncludeCount += incomingLiteralIncludeNodeCount - report.Counter.IncomingIoCodeBlockCount += incomingIoCodeBlockNodeCount - report.Counter.NewCodeNodesCount += newCodeNodes - report.Counter.NewAppliedUsageExamplesCount += newAppliedUsageExampleCount +func UpdateProjectReportForNewPage(page common.DocsPage, report types.ProjectReport) types.ProjectReport { + report.Counter.IncomingCodeNodesCount += page.CodeNodesTotal + report.Counter.IncomingLiteralIncludeCount += page.LiteralIncludesTotal + report.Counter.IncomingIoCodeBlockCount += page.IoCodeBlocksTotal + report.Counter.NewCodeNodesCount += page.CodeNodesTotal report.Counter.NewPagesCount += 1 - report = utils.ReportChanges(types.PageCreated, report, pageId) - if newCodeNodes > 0 { - report = utils.ReportChanges(types.CodeExampleCreated, report, pageId, newCodeNodes) + report = utils.ReportChanges(types.PageCreated, report, page.ID) + if page.CodeNodesTotal > 0 { + report = utils.ReportChanges(types.CodeExampleCreated, report, page.ID, page.CodeNodesTotal) + } + + // Figure out how many of the page's code examples are new applied usage examples + newAppliedUsageExampleCount := 0 + newCodeNodeCount := 0 + if page.Nodes != nil { + for _, node := range *page.Nodes { + if add_code_examples.IsNewAppliedUsageExample(node) { + newAppliedUsageExampleCount++ + } + } + newCodeNodeCount = compare_code_examples.GetCodeNodeCount(*page.Nodes) } + 
report.Counter.NewAppliedUsageExamplesCount += newAppliedUsageExampleCount if newAppliedUsageExampleCount > 0 { - report = utils.ReportChanges(types.AppliedUsageExampleAdded, report, pageId, newAppliedUsageExampleCount) + report = utils.ReportChanges(types.AppliedUsageExampleAdded, report, page.ID, newAppliedUsageExampleCount) } - newCodeNodeCount := newCodeNodes - if incomingCodeNodeCount != newCodeNodeCount { - report = utils.ReportIssues(types.CodeNodeCountIssue, report, pageId, incomingCodeNodeCount, newCodeNodeCount) + if page.CodeNodesTotal != newCodeNodeCount { + report = utils.ReportIssues(types.CodeNodeCountIssue, report, page.ID, page.CodeNodesTotal, newCodeNodeCount) } return report } diff --git a/audit/gdcd/compare-code-examples/CompareExistingIncomingCodeExampleSlices.go b/audit/gdcd/compare-code-examples/CompareExistingIncomingCodeExampleSlices.go index 814647b..ba9378c 100644 --- a/audit/gdcd/compare-code-examples/CompareExistingIncomingCodeExampleSlices.go +++ b/audit/gdcd/compare-code-examples/CompareExistingIncomingCodeExampleSlices.go @@ -18,10 +18,10 @@ import ( // This function attempts to assign a state to & appropriately handle every node. func CompareExistingIncomingCodeExampleSlices(existingNodes []common.CodeNode, existingRemovedNodes []common.CodeNode, incomingNodes []types.ASTNode, report types.ProjectReport, pageId string, llm *ollama.LLM, ctx context.Context, isDriversProject bool) ([]common.CodeNode, types.ProjectReport) { // These are page nodes that are a partial match for nodes on the page. We assume they are making updates to an existing node. - var updatedPageNodes []types.ASTNode + var updatedPageNodes []types.ASTNodeWrapper // These are incoming AST nodes that do not match any existing code nodes on the page. They are net new. - var newPageNodes []types.ASTNode + var newPageNodes []types.ASTNodeWrapper // These are existing code nodes from the database that match incoming AST nodes from the Snooty Data API. 
// They are exact matches that are unchanged. @@ -77,47 +77,49 @@ func CompareExistingIncomingCodeExampleSlices(existingNodes []common.CodeNode, e // Check to see if the incoming AST node hash is an exact match for an unmatched existing code node hash, and // the count is at least 1 if unmatchedSha256Hashes[hash] >= 1 { - // Get the matching code node and append it to the array of unchanged code nodes. We use this to construct the - // new array of nodes on the page. + // Get the matching code node unchangedCodeNode := unmatchedSha256ToCodeNodeMap[hash] - unchangedNodes = append(unchangedNodes, unchangedCodeNode) - // If this hash appears only once in the existing hashes, delete it from the hash list and map. We don't - // want to consider it as a possible match for other nodes. - if unmatchedSha256Hashes[hash] == 1 { - delete(unmatchedSha256Hashes, hash) - delete(unmatchedSha256ToCodeNodeMap, hash) - } else { - // If a sha256 hash appears more than once on a page, decrement one instance from the counter since - // we are counting it as a "match" here + + if unchangedCodeNode.InstancesOnPage != 0 && unchangedCodeNode.InstancesOnPage != count { + // If the unchanged code node does not match the count of number of times this hash appears, decrement + // one instance from the counter since we are counting it as a "match" here. We don't just want to + // delete it because it may also match an "update" later. + unmatchedSha256Hashes[hash]-- + } else if unchangedCodeNode.InstancesOnPage == 0 && unmatchedSha256Hashes[hash] > 1 { + // If `InstancesOnPage` is uninitialized, we can't compare it with the hash count, so just decrement the hash count unmatchedSha256Hashes[hash]-- - } - // If the hash exists only once in the incoming AST node hash map, delete it from the hash list and map. - // Otherwise, decrement the count of times it appears. 
- if count == 1 { - delete(snootySha256Hashes, hash) - delete(snootySha256ToAstNodeMap, hash) } else { - snootySha256Hashes[hash]-- + // If it _does_ match the number of times the hash appears, consider it unchanged. Delete it from the + // unmatched hash list and map. Now that it has matched, we don't need to consider it as a possible + // match for other nodes. + delete(unmatchedSha256Hashes, hash) + delete(unmatchedSha256ToCodeNodeMap, hash) } + + // Update the count to reflect how many times it currently appears on the page + unchangedCodeNode.InstancesOnPage = count + + // Append it to the array of unchanged nodes. We use this to rebuild the array of code nodes we'll write to the DB. + unchangedNodes = append(unchangedNodes, unchangedCodeNode) + + // Delete it from the incoming hash list and map. + delete(snootySha256Hashes, hash) + delete(snootySha256ToAstNodeMap, hash) } } // Now start checking whether the remaining incoming AST nodes are updates or net new examples. for hash, count := range snootySha256Hashes { astNode := snootySha256ToAstNodeMap[hash] + nodePlusMetadata := types.ASTNodeWrapper{ + InstancesOnPage: count, + Node: astNode, + } // Figure out whether the AST node is new or updated. If it matches an existing code node in the DB, // this function returns the existing code node along with the string "newExample" or "updated". newOrUpdated, existingNode := CodeNewOrUpdated(unmatchedSha256ToCodeNodeMap, astNode) if newOrUpdated == newExample { - newPageNodes = append(newPageNodes, astNode) - // If the hash exists only once in the incoming AST node hash map, delete it from the hash list and map. - // Otherwise, decrement the count of times it appears. 
- if count == 1 { - delete(snootySha256Hashes, hash) - delete(snootySha256ToAstNodeMap, hash) - } else { - snootySha256Hashes[hash]-- - } + newPageNodes = append(newPageNodes, nodePlusMetadata) } else { if existingNode != nil { incomingUpdatedSha256ToCodeNodeMap[hash] = *existingNode @@ -135,7 +137,7 @@ func CompareExistingIncomingCodeExampleSlices(existingNodes []common.CodeNode, e unmatchedSha256Hashes[existingNode.SHA256Hash]-- } } - updatedPageNodes = append(updatedPageNodes, astNode) + updatedPageNodes = append(updatedPageNodes, nodePlusMetadata) } } diff --git a/audit/gdcd/compare-code-examples/CompareExistingIncomingCodeExampleSlices_test.go b/audit/gdcd/compare-code-examples/CompareExistingIncomingCodeExampleSlices_test.go index 419ff92..e5b6b5d 100644 --- a/audit/gdcd/compare-code-examples/CompareExistingIncomingCodeExampleSlices_test.go +++ b/audit/gdcd/compare-code-examples/CompareExistingIncomingCodeExampleSlices_test.go @@ -288,3 +288,119 @@ func TestOneUpdatedOneUnchangedCodeExampleHandledCorrectly(t *testing.T) { fmt.Println("Removed code node count: ", updatedReport.Counter.RemovedCodeNodesCount) } } + +func TestDuplicateUpdatedCodeExampleHandledCorrectly(t *testing.T) { + existingNodes := []common.CodeNode{} + existingRemovedNodes := []common.CodeNode{} + incomingNodes := []types.ASTNode{} + existingNode, updatedASTNode := data.GetUpdatedNodes() + existingNodes = append(existingNodes, existingNode) + // Appending the same node twice to represent it as a duplicate on the page + incomingNodes = append(incomingNodes, updatedASTNode) + incomingNodes = append(incomingNodes, updatedASTNode) + initialReport := types.ProjectReport{ + ProjectName: "test-project", + Changes: nil, + Issues: nil, + Counter: types.ProjectCounts{}, + } + pageId := "some/page/url" + llm, err := ollama.New(ollama.WithModel(add_code_examples.MODEL)) + if err != nil { + log.Fatalf("failed to connect to ollama: %v", err) + } + ctx := context.Background() + nodes, updatedReport := 
CompareExistingIncomingCodeExampleSlices(existingNodes, existingRemovedNodes, incomingNodes, initialReport, pageId, llm, ctx, true) + + // For this test, we only want to store the code node once in the array. But we want `InstancesOnPage` to show + // that it is on the page twice. And we want the report to count it as two code examples. + if len(nodes) != 1 { + t.Errorf("FAILED: Got %d, want 1 code nodes", len(nodes)) + } + if nodes[0].InstancesOnPage != 2 { + t.Errorf("FAILED: Got %d, want 2 instances on page", nodes[0].InstancesOnPage) + } + if updatedReport.Counter.UpdatedCodeNodesCount != 2 { + t.Errorf("FAILED: Got %d, report should show 2 updated code nodes", updatedReport.Counter.UpdatedCodeNodesCount) + fmt.Println("New code node count: ", updatedReport.Counter.NewCodeNodesCount) + fmt.Println("Updated code node count: ", updatedReport.Counter.UpdatedCodeNodesCount) + fmt.Println("Unchanged code node count: ", updatedReport.Counter.UnchangedCodeNodesCount) + fmt.Println("Removed code node count: ", updatedReport.Counter.RemovedCodeNodesCount) + } +} + +func TestDuplicateUnchangedCodeExampleHandledCorrectly(t *testing.T) { + existingNodes := []common.CodeNode{} + existingRemovedNodes := []common.CodeNode{} + incomingNodes := []types.ASTNode{} + existingNode, incomingUnchangedNode := data.GetUnchangedNodes() + existingNodes = append(existingNodes, existingNode) + // Appending the same node twice to represent it as a duplicate on the page + incomingNodes = append(incomingNodes, incomingUnchangedNode) + incomingNodes = append(incomingNodes, incomingUnchangedNode) + initialReport := types.ProjectReport{ + ProjectName: "test-project", + Changes: nil, + Issues: nil, + Counter: types.ProjectCounts{}, + } + pageId := "some/page/url" + llm, err := ollama.New(ollama.WithModel(add_code_examples.MODEL)) + if err != nil { + log.Fatalf("failed to connect to ollama: %v", err) + } + ctx := context.Background() + nodes, updatedReport := CompareExistingIncomingCodeExampleSlices(existingNodes, 
existingRemovedNodes, incomingNodes, initialReport, pageId, llm, ctx, true) + + // For this test, we only want to store the code node once in the array. But we want `InstancesOnPage` to show + // that it is on the page twice. And we want the report to count it as two code examples. + if len(nodes) != 1 { + t.Errorf("FAILED: Got %d, want 1 code nodes", len(nodes)) + } + if nodes[0].InstancesOnPage != 2 { + t.Errorf("FAILED: Got %d, want 2 instances on page", nodes[0].InstancesOnPage) + } + if updatedReport.Counter.UnchangedCodeNodesCount != 2 { + t.Errorf("FAILED: Got %d, report should show 2 unchanged code nodes", updatedReport.Counter.UnchangedCodeNodesCount) + fmt.Println("New code node count: ", updatedReport.Counter.NewCodeNodesCount) + fmt.Println("Updated code node count: ", updatedReport.Counter.UpdatedCodeNodesCount) + fmt.Println("Unchanged code node count: ", updatedReport.Counter.UnchangedCodeNodesCount) + fmt.Println("Removed code node count: ", updatedReport.Counter.RemovedCodeNodesCount) + } +} + +func TestDuplicateNewCodeExampleHandledCorrectly(t *testing.T) { + existingNodes := []common.CodeNode{} + existingRemovedNodes := []common.CodeNode{} + incomingNodes := data.GetNewASTNodes(1) + incomingNodes = append(incomingNodes, incomingNodes...) + initialReport := types.ProjectReport{ + ProjectName: "test-project", + Changes: nil, + Issues: nil, + Counter: types.ProjectCounts{}, + } + pageId := "some/page/url" + llm, err := ollama.New(ollama.WithModel(add_code_examples.MODEL)) + if err != nil { + log.Fatalf("failed to connect to ollama: %v", err) + } + ctx := context.Background() + nodes, updatedReport := CompareExistingIncomingCodeExampleSlices(existingNodes, existingRemovedNodes, incomingNodes, initialReport, pageId, llm, ctx, true) + + // For this test, we only want to store the new code node once in the array. But we want `InstancesOnPage` to show + // that it is on the page twice. And we want the report to count it as two code examples. 
+ if len(nodes) != 1 { + t.Errorf("FAILED: Got %d, want 1 code nodes", len(nodes)) + } + if nodes[0].InstancesOnPage != 2 { + t.Errorf("FAILED: Got %d, want 2 instances on page", nodes[0].InstancesOnPage) + } + if updatedReport.Counter.NewCodeNodesCount != 2 { + t.Errorf("FAILED: Got %d, report should show 2 new code nodes", updatedReport.Counter.NewCodeNodesCount) + fmt.Println("New code node count: ", updatedReport.Counter.NewCodeNodesCount) + fmt.Println("Updated code node count: ", updatedReport.Counter.UpdatedCodeNodesCount) + fmt.Println("Unchanged code node count: ", updatedReport.Counter.UnchangedCodeNodesCount) + fmt.Println("Removed code node count: ", updatedReport.Counter.RemovedCodeNodesCount) + } +} diff --git a/audit/gdcd/compare-code-examples/GetCodeNodeCount.go b/audit/gdcd/compare-code-examples/GetCodeNodeCount.go new file mode 100644 index 0000000..6ae2105 --- /dev/null +++ b/audit/gdcd/compare-code-examples/GetCodeNodeCount.go @@ -0,0 +1,18 @@ +package compare_code_examples + +import "common" + +func GetCodeNodeCount(codeNodes []common.CodeNode) int { + count := 0 + for _, codeNode := range codeNodes { + // If the `InstancesOnPage` field is initialized, it should have a count, and we add it to the count. If it's + // not initialized, Go default initializes int values to 0, so its value should count as 0 here. In that case, + // just increment the count by 1. + if codeNode.InstancesOnPage != 0 { + count += codeNode.InstancesOnPage + } else { + count++ + } + } + return count +} diff --git a/audit/gdcd/compare-code-examples/HandleNewPageNodes.go b/audit/gdcd/compare-code-examples/HandleNewPageNodes.go index 50123ff..5a599cb 100644 --- a/audit/gdcd/compare-code-examples/HandleNewPageNodes.go +++ b/audit/gdcd/compare-code-examples/HandleNewPageNodes.go @@ -11,11 +11,18 @@ import ( // HandleNewPageNodes creates a slice of new CodeNode objects from new ASTNode objects, and hands it back to the call site. 
// We append all the "Handle" function results to an array, and overwrite the document in the DB with the updated code nodes. -func HandleNewPageNodes(newIncomingPageNodes []types.ASTNode, llm *ollama.LLM, ctx context.Context, isDriversProject bool) []common.CodeNode { +func HandleNewPageNodes(newIncomingPageNodes []types.ASTNodeWrapper, llm *ollama.LLM, ctx context.Context, isDriversProject bool) ([]common.CodeNode, int) { newNodes := make([]common.CodeNode, 0) + newCodeNodeCount := 0 for _, incomingNode := range newIncomingPageNodes { - newNode := snooty.MakeCodeNodeFromSnootyAST(incomingNode, llm, ctx, isDriversProject) + newNode := snooty.MakeCodeNodeFromSnootyAST(incomingNode.Node, llm, ctx, isDriversProject) + if incomingNode.InstancesOnPage > 1 { + newNode.InstancesOnPage = incomingNode.InstancesOnPage + newCodeNodeCount += incomingNode.InstancesOnPage + } else { + newCodeNodeCount++ + } newNodes = append(newNodes, newNode) } - return newNodes + return newNodes, newCodeNodeCount } diff --git a/audit/gdcd/compare-code-examples/HandleNewPageNodes_test.go b/audit/gdcd/compare-code-examples/HandleNewPageNodes_test.go index 7823d77..0ba24d3 100644 --- a/audit/gdcd/compare-code-examples/HandleNewPageNodes_test.go +++ b/audit/gdcd/compare-code-examples/HandleNewPageNodes_test.go @@ -4,6 +4,7 @@ import ( "context" add_code_examples "gdcd/add-code-examples" "gdcd/compare-code-examples/data" + "gdcd/types" "github.com/tmc/langchaingo/llms/ollama" "log" "testing" @@ -13,12 +14,16 @@ import ( // make any assertions about the values because the code that actually creates the code nodes is tested in the 'add-code-examples' package. 
func TestHandleNewPageNodesCreatesOneNode(t *testing.T) { astNode := data.GetNewASTNodes(1) + astNodeWrapper := types.ASTNodeWrapper{ + InstancesOnPage: 1, + Node: astNode[0], + } llm, err := ollama.New(ollama.WithModel(add_code_examples.MODEL)) if err != nil { log.Fatalf("failed to connect to ollama: %v", err) } ctx := context.Background() - codeNodes := HandleNewPageNodes(astNode, llm, ctx, true) + codeNodes, _ := HandleNewPageNodes([]types.ASTNodeWrapper{astNodeWrapper}, llm, ctx, true) if len(codeNodes) != 1 { t.Errorf("FAILED: Should have 1 new code node") } @@ -26,12 +31,20 @@ func TestHandleNewPageNodesCreatesOneNode(t *testing.T) { func TestHandleNewPageNodesCreatesMultipleNodes(t *testing.T) { astNodes := data.GetNewASTNodes(2) + astNodeWrapper1 := types.ASTNodeWrapper{ + InstancesOnPage: 1, + Node: astNodes[0], + } + astNodeWrapper2 := types.ASTNodeWrapper{ + InstancesOnPage: 1, + Node: astNodes[1], + } llm, err := ollama.New(ollama.WithModel(add_code_examples.MODEL)) ctx := context.Background() if err != nil { log.Fatalf("failed to connect to ollama: %v", err) } - codeNodes := HandleNewPageNodes(astNodes, llm, ctx, true) + codeNodes, _ := HandleNewPageNodes([]types.ASTNodeWrapper{astNodeWrapper1, astNodeWrapper2}, llm, ctx, true) if len(codeNodes) != 2 { t.Errorf("FAILED: Should have 2 new code node") } diff --git a/audit/gdcd/compare-code-examples/HandleRemovedCodeNodes.go b/audit/gdcd/compare-code-examples/HandleRemovedCodeNodes.go index 5f937f3..098742d 100644 --- a/audit/gdcd/compare-code-examples/HandleRemovedCodeNodes.go +++ b/audit/gdcd/compare-code-examples/HandleRemovedCodeNodes.go @@ -10,12 +10,20 @@ import ( // updated code nodes. We don't just remove the nodes directly because we want to maintain a count of codes that we // have removed - i.e. if we remove removed nodes, and add new nodes, the count stays the same and we can't track // net new code examples. 
-func HandleRemovedCodeNodes(removedCodeNodes []common.CodeNode) []common.CodeNode { +func HandleRemovedCodeNodes(removedCodeNodes []common.CodeNode) ([]common.CodeNode, int) { updatedRemovedNodes := make([]common.CodeNode, 0) + updatedRemovedCodeNodeCount := 0 for _, node := range removedCodeNodes { node.IsRemoved = true - node.DateUpdated = time.Now() + node.DateRemoved = time.Now() + if node.InstancesOnPage > 1 { + updatedRemovedCodeNodeCount += node.InstancesOnPage + node.InstancesOnPage = 0 + } else { + updatedRemovedCodeNodeCount++ + } + node.InstancesOnPage = 0 updatedRemovedNodes = append(updatedRemovedNodes, node) } - return updatedRemovedNodes + return updatedRemovedNodes, updatedRemovedCodeNodeCount } diff --git a/audit/gdcd/compare-code-examples/HandleUpdatedPageNodes.go b/audit/gdcd/compare-code-examples/HandleUpdatedPageNodes.go index 32d5ed4..ee5d393 100644 --- a/audit/gdcd/compare-code-examples/HandleUpdatedPageNodes.go +++ b/audit/gdcd/compare-code-examples/HandleUpdatedPageNodes.go @@ -13,16 +13,23 @@ import ( // up the matching code node, update the Code field text, add the new SHA256Hash, and append an updated date. We return // the updated []common.CodeNode array. We append all the "Handle" function results to a slice, and overwrite the // document in the DB with the updated code nodes. 
-func HandleUpdatedPageNodes(updatedPageNodes []types.ASTNode, incomingSha256ToCodeNodesMap map[string]common.CodeNode) []common.CodeNode { +func HandleUpdatedPageNodes(updatedPageNodes []types.ASTNodeWrapper, incomingSha256ToCodeNodesMap map[string]common.CodeNode) ([]common.CodeNode, int) { codeNodeUpdates := make([]common.CodeNode, 0) + updatedCodeNodeCount := 0 for _, incomingNode := range updatedPageNodes { - whiteSpaceTrimmedString := strings.TrimSpace(incomingNode.Value) + whiteSpaceTrimmedString := strings.TrimSpace(incomingNode.Node.Value) hash := snooty.MakeSha256HashForCode(whiteSpaceTrimmedString) codeNode := incomingSha256ToCodeNodesMap[hash] codeNode.Code = whiteSpaceTrimmedString codeNode.SHA256Hash = hash codeNode.DateUpdated = time.Now() + if incomingNode.InstancesOnPage > 1 { + codeNode.InstancesOnPage = incomingNode.InstancesOnPage + updatedCodeNodeCount += incomingNode.InstancesOnPage + } else { + updatedCodeNodeCount++ + } codeNodeUpdates = append(codeNodeUpdates, codeNode) } - return codeNodeUpdates + return codeNodeUpdates, updatedCodeNodeCount } diff --git a/audit/gdcd/compare-code-examples/HandleUpdatedPageNodes_test.go b/audit/gdcd/compare-code-examples/HandleUpdatedPageNodes_test.go index 88999e2..0bea80a 100644 --- a/audit/gdcd/compare-code-examples/HandleUpdatedPageNodes_test.go +++ b/audit/gdcd/compare-code-examples/HandleUpdatedPageNodes_test.go @@ -12,11 +12,15 @@ import ( func TestHandleUpdatedPageNodesCorrectlyUpdatesValues(t *testing.T) { codeNode, astNode := data.GetUpdatedNodes() + astNodeWrapper := types.ASTNodeWrapper{ + InstancesOnPage: 1, + Node: astNode, + } sha256HashCodeNodeLookupMap := make(map[string]common.CodeNode) whitespaceTrimmedString := strings.TrimSpace(astNode.Value) incomingSha26Hash := snooty.MakeSha256HashForCode(whitespaceTrimmedString) sha256HashCodeNodeLookupMap[incomingSha26Hash] = codeNode - updatedCodeNodes := HandleUpdatedPageNodes([]types.ASTNode{astNode}, sha256HashCodeNodeLookupMap) + 
updatedCodeNodes, _ := HandleUpdatedPageNodes([]types.ASTNodeWrapper{astNodeWrapper}, sha256HashCodeNodeLookupMap) updatedCodeNode := updatedCodeNodes[0] if updatedCodeNode.SHA256Hash != incomingSha26Hash { t.Errorf("FAILED: got %s on the code node hash, want %s", updatedCodeNode.SHA256Hash, incomingSha26Hash) diff --git a/audit/gdcd/compare-code-examples/MakeUpdatedCodeNodesArray.go b/audit/gdcd/compare-code-examples/MakeUpdatedCodeNodesArray.go index b70fa97..389d1ec 100644 --- a/audit/gdcd/compare-code-examples/MakeUpdatedCodeNodesArray.go +++ b/audit/gdcd/compare-code-examples/MakeUpdatedCodeNodesArray.go @@ -15,9 +15,9 @@ import ( func MakeUpdatedCodeNodesArray(removedCodeNodes []common.CodeNode, existingRemovedCodeNodes []common.CodeNode, unchangedPageNodes []common.CodeNode, - updatedPageNodes []types.ASTNode, + updatedPageNodes []types.ASTNodeWrapper, updatedPageNodesSha256CodeNodeLookup map[string]common.CodeNode, - newPageNodes []types.ASTNode, + newPageNodes []types.ASTNodeWrapper, incomingCount int, report types.ProjectReport, pageId string, @@ -30,9 +30,11 @@ func MakeUpdatedCodeNodesArray(removedCodeNodes []common.CodeNode, newAppliedUsageExampleCounts := 0 // Call all the 'Handle' functions in sequence - updatedCodeNodes := HandleUpdatedPageNodes(updatedPageNodes, updatedPageNodesSha256CodeNodeLookup) - newCodeNodes := HandleNewPageNodes(newPageNodes, llm, ctx, isDriversProject) - removedCodeNodesUpdatedForRemoval := HandleRemovedCodeNodes(removedCodeNodes) + updatedCodeNodes, updatedCodeNodeCount := HandleUpdatedPageNodes(updatedPageNodes, updatedPageNodesSha256CodeNodeLookup) + + newCodeNodes, newCodeNodeCount := HandleNewPageNodes(newPageNodes, llm, ctx, isDriversProject) + + removedCodeNodesUpdatedForRemoval, newRemovedCodeNodesCount := HandleRemovedCodeNodes(removedCodeNodes) if len(newCodeNodes) > 0 { for _, node := range newCodeNodes { @@ -42,14 +44,18 @@ func MakeUpdatedCodeNodesArray(removedCodeNodes []common.CodeNode, } } + 
unchangedPageNodeCount := GetCodeNodeCount(unchangedPageNodes) + // Make the updated []types.CodeNode array aggregateCodeNodeChanges = append(aggregateCodeNodeChanges, unchangedPageNodes...) aggregateCodeNodeChanges = append(aggregateCodeNodeChanges, updatedCodeNodes...) aggregateCodeNodeChanges = append(aggregateCodeNodeChanges, newCodeNodes...) aggregateCodeNodeChanges = append(aggregateCodeNodeChanges, removedCodeNodesUpdatedForRemoval...) + aggregateCodeNodeCount := GetCodeNodeCount(aggregateCodeNodeChanges) + // Increment project counters - report = UpdateProjectReportForUpdatedCodeNodes(report, pageId, incomingCount, len(unchangedPageNodes), len(updatedCodeNodes), len(newCodeNodes), len(removedCodeNodesUpdatedForRemoval), len(aggregateCodeNodeChanges), newAppliedUsageExampleCounts) + report = UpdateProjectReportForUpdatedCodeNodes(report, pageId, incomingCount, unchangedPageNodeCount, updatedCodeNodeCount, newCodeNodeCount, newRemovedCodeNodesCount, aggregateCodeNodeCount, newAppliedUsageExampleCounts) // We don't want to report on any of the removed code nodes that were already on the page, but we do want to keep them // around, so append them to the Nodes array after adding and reporting on the new stuff aggregateCodeNodeChanges = append(aggregateCodeNodeChanges, existingRemovedCodeNodes...) 
diff --git a/audit/gdcd/compare-code-examples/IncrementProjectCounter.go b/audit/gdcd/compare-code-examples/UpdateProjectReportForUpdatedCodeNodes.go similarity index 100% rename from audit/gdcd/compare-code-examples/IncrementProjectCounter.go rename to audit/gdcd/compare-code-examples/UpdateProjectReportForUpdatedCodeNodes.go diff --git a/audit/gdcd/compare-code-examples/IncrementProjectCounter_test.go b/audit/gdcd/compare-code-examples/UpdateProjectReportForUpdatedCodeNodes_test.go similarity index 100% rename from audit/gdcd/compare-code-examples/IncrementProjectCounter_test.go rename to audit/gdcd/compare-code-examples/UpdateProjectReportForUpdatedCodeNodes_test.go diff --git a/audit/gdcd/db/HandleMissingPageIds.go b/audit/gdcd/db/HandleMissingPageIds.go index 8ad9774..a76fdfb 100644 --- a/audit/gdcd/db/HandleMissingPageIds.go +++ b/audit/gdcd/db/HandleMissingPageIds.go @@ -3,30 +3,82 @@ package db import ( "gdcd/types" "gdcd/utils" + "strings" ) -func HandleMissingPageIds(collectionName string, incomingPageIds map[string]bool, report types.ProjectReport) types.ProjectReport { +// HandleMissingPageIds gets a list of all the page IDs from Atlas, compares each page ID against incoming ones coming +// in from Snooty, and tries to figure out whether existing IDs that do not match incoming ones are moved pages or removed +// pages. If the page is removed, we delete it from the DB. We pass moved and new pages back to the call site for further +// handling. 
+func HandleMissingPageIds(collectionName string, incomingPageIds map[string]bool, maybeNewPages []types.NewOrMovedPage, report types.ProjectReport) (types.ProjectReport, []types.NewOrMovedPage, []types.NewOrMovedPage) { + var movedPages []types.NewOrMovedPage // Get a slice of all the page IDs for pages that are currently in Atlas existingPageIds := GetAtlasPageIDs(collectionName) // If we don't get any page IDs from Atlas, just return the unmodified report if existingPageIds == nil { - return report + return report, maybeNewPages, movedPages } // Compare the pages that are currently in Atlas with pages coming in from the Snooty Data API. If the page exists // in Atlas but isn't coming in from the Snooty Data API, grab the ID so we can remove the page in Atlas. - // TODO: There may be a logic issue here. When we could not retrieve the page ID from the DB; the page was getting - // deleted. That suggests some logic is backward here, but I can't see a logic issue. Revisit if this still appears - // to be a problem now that the DB retrieval func has retry logic. (And/or add testing for this!) 
- var pageIdsWithNoMatchingSnootyPage []string + var maybeRemovedPageIds []string for _, existingId := range existingPageIds { if incomingPageIds[existingId] { // If the page ID in Atlas matches an incoming page ID from Snooty matches, skip the rest of the loop continue } - // If an existing ID in Atlas does not match any of the pages coming in from Snooty, add the ID to a list of pages we should delete - pageIdsWithNoMatchingSnootyPage = append(pageIdsWithNoMatchingSnootyPage, existingId) + // If an existing ID in Atlas does not match any of the pages coming in from Snooty, add the ID to a list of pages that might be removed + maybeRemovedPageIds = append(maybeRemovedPageIds, existingId) } - for _, pageIdToDelete := range pageIdsWithNoMatchingSnootyPage { + + var pageIdsToDelete []string + + // A page ID that isn't an exact match for one coming in from Snooty could be either a moved page or a removed page + for _, maybeRemovedPageId := range maybeRemovedPageIds { + existingPage := GetAtlasPageData(collectionName, maybeRemovedPageId) + pageIsMoved := false + + // Compare the removed page against the unaccounted for pages in the collection. An incoming page that + // does not have a matching page ID could be either moved or new. 
If the count of code examples, literalincludes, + // and io-code-blocks exactly matches a removed page, we'll call it "moved" instead of "new" + for index, maybeNewPage := range maybeNewPages { + // If the page IDs share a page name, they might be the same page + pageIdsOverlap := checkIfPageIdsOverlap(existingPage.ID, maybeNewPage.PageId) + + // If the count of code examples is exactly the same, *and* that count is not 0, they might be the same page + codeNodeCountMatches := maybeNewPage.CodeNodeCount == existingPage.CodeNodesTotal && maybeNewPage.CodeNodeCount != 0 + + // To be more precise, also check if the count of literalincludes and io-code-blocks match + literalIncludeCountMatches := maybeNewPage.LiteralIncludeCount == existingPage.LiteralIncludesTotal + ioCodeBlockCountMatches := maybeNewPage.IoCodeBlockCount == existingPage.IoCodeBlocksTotal + + // If the page name shares common elements, all three counts match, and the code node count is not 0, + // consider it a moved page instead of new & removed pages + if pageIdsOverlap && codeNodeCountMatches && literalIncludeCountMatches && ioCodeBlockCountMatches { + maybeNewPage.NewPageId = maybeNewPage.PageId + maybeNewPage.OldPageId = existingPage.ID + maybeNewPage.DateAdded = existingPage.DateAdded + movedPages = append(movedPages, maybeNewPage) + + // If we find a match, we can remove it from the `maybeNewPages` slice so we don't attempt to match it again + // Anything left in the `maybeNewPages` slice after comparing all the maybe removed pages is net new, so + // we'll pass it back to the call site to handle it as a new page + maybeNewPages = removeMovedPage(maybeNewPages, index) + pageIsMoved = true + + // We've found a match, so we can skip the rest of the `maybeNewPages` for this `maybeRemovedPageId` + break + } + } + + // If we have gone through all the maybe new pages, and none is an exact match in code example counts, consider + // it a removed page + if !pageIsMoved { + pageIdsToDelete = 
// checkIfPageIdsOverlap does a couple of types of comparison between the old page ID and the new page ID to determine
// if they "match". Page IDs are `|`-separated; the final segment is treated as the page title, e.g. in
// `tutorial|create-mongodb-user-for-cluster` the title is `create-mongodb-user-for-cluster`.
//
// It returns true when any of the following holds:
//   - the two titles are identical (the page moved between directories without being renamed);
//   - the old title equals the second-to-last segment of the new ID (the page became the title page of a section
//     and now has pages nested below it);
//   - the two titles share at least one `-`-separated word that is not an ignore word.
func checkIfPageIdsOverlap(oldPageId string, newPageId string) bool {
	oldPageName := getPageTitleFromId(oldPageId)
	newPageName := getPageTitleFromId(newPageId)

	// The simplest case is a restructure that moves the page from one directory to another without renaming it.
	if oldPageName == newPageName {
		return true
	}

	// The segments must come from the full new page ID. Passing the already-extracted title would leave nothing to
	// split on `|`, and the parent segment could never be seen.
	if contains(getExtendedPageTitleFromId(newPageId), oldPageName) {
		return true
	}

	// Otherwise fall back to comparing the words in the two titles.
	return pageNamesHaveCommonElements(oldPageName, newPageName)
}

// getPageTitleFromId returns the text after the last `|` in pageId, or all of pageId when it contains no `|`.
func getPageTitleFromId(pageId string) string {
	// LastIndex returns -1 when there is no separator, which makes the slice start at 0.
	return pageId[strings.LastIndex(pageId, "|")+1:]
}

// getExtendedPageTitleFromId returns the last `|`-separated segment of pageId followed, when pageId has more than one
// segment, by the second-to-last segment. The second-to-last segment may hold what used to be the page title before
// pages were nested below it.
func getExtendedPageTitleFromId(pageId string) []string {
	// Split with a non-empty separator always yields at least one element, so indexing the last one is safe.
	parts := strings.Split(pageId, "|")

	titleSegments := []string{parts[len(parts)-1]}
	if len(parts) > 1 {
		titleSegments = append(titleSegments, parts[len(parts)-2])
	}
	return titleSegments
}

// pageNamesHaveCommonElements reports whether oldPageName and newPageName share at least one `-`-separated word that
// is not in the ignore list ("and", "or", "by", "for", "the", "in").
func pageNamesHaveCommonElements(oldPageName string, newPageName string) bool {
	// We don't want irrelevant words to count as an overlap.
	ignoreWords := []string{"and", "or", "by", "for", "the", "in"}

	oldPageNameElements := make(map[string]bool)
	for _, element := range strings.Split(oldPageName, "-") {
		oldPageNameElements[element] = true
	}

	// One shared, non-ignored word is enough, so stop at the first hit.
	for _, element := range strings.Split(newPageName, "-") {
		if oldPageNameElements[element] && !contains(ignoreWords, element) {
			return true
		}
	}
	return false
}

// contains reports whether str is an element of slice.
func contains(slice []string, str string) bool {
	for _, value := range slice {
		if value == str {
			return true
		}
	}
	return false
}

// removeMovedPage returns a new slice holding every element of maybeNewPages except the one at index.
//
// The result never shares a backing array with maybeNewPages, so the caller's original slice keeps its contents. An
// index outside the slice returns maybeNewPages unchanged.
func removeMovedPage[T any](maybeNewPages []T, index int) []T {
	if index < 0 || index >= len(maybeNewPages) {
		return maybeNewPages
	}
	remaining := make([]T, 0, len(maybeNewPages)-1)
	remaining = append(remaining, maybeNewPages[:index]...)
	return append(remaining, maybeNewPages[index+1:]...)
}
/dev/null +++ b/audit/gdcd/types/NewOrMovedPage.go @@ -0,0 +1,14 @@ +package types + +import "time" + +type NewOrMovedPage struct { + PageId string + CodeNodeCount int + LiteralIncludeCount int + IoCodeBlockCount int + PageData PageMetadata + OldPageId string + NewPageId string + DateAdded time.Time +} diff --git a/audit/gdcd/types/Report.go b/audit/gdcd/types/Report.go index 223bff8..74a14d6 100644 --- a/audit/gdcd/types/Report.go +++ b/audit/gdcd/types/Report.go @@ -25,6 +25,7 @@ const ( // Define the possible types of changes. PageCreated ChangeType = iota PageUpdated + PageMoved PageRemoved KeywordsUpdated CodeExampleCreated @@ -59,7 +60,7 @@ type Issue struct { // String returns a string representation of the ChangeType for easier readability. func (ct ChangeType) String() string { - return [...]string{"Page created", "Page updated", "Page removed", "Keywords updated", "Code example created", "Code example updated", "Code example removed", "Code node count change", "literalinclude count change", "io-code-block count change", "Project summary node count change", "Project summary page count change", "Applied usage example added"}[ct] + return [...]string{"Page created", "Page updated", "Page moved", "Page removed", "Keywords updated", "Code example created", "Code example updated", "Code example removed", "Code node count change", "literalinclude count change", "io-code-block count change", "Project summary node count change", "Project summary page count change", "Applied usage example added"}[ct] } // String returns a string representation of the IssueType for easier readability. 
// ConvertAtlasPageIdToProductionUrl builds a URL for a page from its Atlas page ID and the production site URL.
//
// The Atlas ID has `|`-separated segments; each `|` is replaced with `/` and the resulting path is appended to
// siteUrl. Trailing slashes on siteUrl are dropped first so the result never contains `//` at the join.
func ConvertAtlasPageIdToProductionUrl(pageId string, siteUrl string) string {
	pageIdAsUrlSegments := strings.ReplaceAll(pageId, "|", "/")
	return strings.TrimRight(siteUrl, "/") + "/" + pageIdAsUrlSegments
}