Skip to content

(DOCSP-49695): Handle moved pages #54

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jul 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions audit/common/CodeNode.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ import "time"

// CodeNode captures metadata about a specific code example. The `Code` field contains the example itself.
type CodeNode struct {
Code string `bson:"code"`
Language string `bson:"language"`
FileExtension string `bson:"file_extension"`
Category string `bson:"category"`
SHA256Hash string `bson:"sha_256_hash"`
LLMCategorized bool `bson:"llm_categorized"`
DateAdded time.Time `bson:"date_added"`
DateUpdated time.Time `bson:"date_updated,omitempty"`
DateRemoved time.Time `bson:"date_removed,omitempty"`
IsRemoved bool `bson:"is_removed,omitempty"`
Code string `bson:"code"`
Language string `bson:"language"`
FileExtension string `bson:"file_extension"`
Category string `bson:"category"`
SHA256Hash string `bson:"sha_256_hash"`
LLMCategorized bool `bson:"llm_categorized"`
DateAdded time.Time `bson:"date_added"`
DateUpdated time.Time `bson:"date_updated,omitempty"`
DateRemoved time.Time `bson:"date_removed,omitempty"`
IsRemoved bool `bson:"is_removed,omitempty"`
InstancesOnPage int `bson:"instances_on_page,omitempty"`
}
95 changes: 84 additions & 11 deletions audit/gdcd/CheckPagesForUpdates.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ package main
import (
"common"
"context"
"fmt"
"gdcd/db"
"gdcd/snooty"
"gdcd/types"
"gdcd/utils"
"time"

"github.com/tmc/langchaingo/llms/ollama"
)
Expand All @@ -17,8 +20,13 @@ import (
func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetails, llm *ollama.LLM, ctx context.Context, report types.ProjectReport) {
incomingPageIdsMatchingExistingPages := make(map[string]bool)
incomingDeletedPageCount := 0
var newPageIds []string
var newPages []common.DocsPage

// When a page doesn't match one in the DB, it could be either a net-new page or a moved page. Hold it in a
// temporary slice so it can be compared against the removed pages later.
var maybeNewPages []types.NewOrMovedPage
var newPages []types.NewOrMovedPage
var newPageDBEntries []common.DocsPage
var movedPages []types.NewOrMovedPage
var updatedPages []common.DocsPage
for _, page := range pages {
// The Snooty Data API returns pages that may have been deleted. If the page is deleted, we want to check and see
Expand All @@ -27,6 +35,7 @@ func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetail
if page.Data.Deleted {
report = HandleDeletedIncomingPages(project.ProjectName, page, report)
incomingDeletedPageCount++
utils.UpdateSecondaryTarget()
} else {
maybeExistingPage := CheckForExistingPage(project.ProjectName, page)
if maybeExistingPage != nil {
Expand All @@ -39,20 +48,72 @@ func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetail
if updatedPage != nil {
updatedPages = append(updatedPages, *updatedPage)
}
utils.UpdateSecondaryTarget()
} else {
// If there is no existing document in Atlas that matches the page, we need to make a new page
var newPage common.DocsPage
newPage, report = MakeNewPage(page, project.ProdUrl, report, llm, ctx)
newPageIds = append(newPageIds, newPage.ID)
newPages = append(newPages, newPage)
// If there is no existing document in Atlas that matches the page, it is either a net-new page or an existing
// page that has moved to a new page ID. Store it in the temporary `maybeNewPages` slice so we can compare
// it against removed pages later and, if it matches one, treat it as a "moved" page instead of a new one.
newOrMovedPage := getNewOrMovedPageDetails(page.Data)
maybeNewPages = append(maybeNewPages, newOrMovedPage)
}
}
utils.UpdateSecondaryTarget()
}

// After iterating through the incoming pages from the Snooty Data API, we need to figure out if any of the page IDs
// we had in the DB are not coming in from the incoming response. If so, we should delete those entries.
report = db.HandleMissingPageIds(project.ProjectName, incomingPageIdsMatchingExistingPages, report)
// we had in the DB are not coming in from the incoming response. If so, those pages are either moved or removed.
report, newPages, movedPages = db.HandleMissingPageIds(project.ProjectName, incomingPageIdsMatchingExistingPages, maybeNewPages, report)

// If we have new pages, create the corresponding DocsPage and increment the project report for them
if newPages != nil {
for _, page := range newPages {
newPage := MakeNewPage(page.PageData, project.ProjectName, project.ProdUrl, llm, ctx)
newPageDBEntries = append(newPageDBEntries, newPage)
report = UpdateProjectReportForNewPage(newPage, report)
utils.UpdateSecondaryTarget()
}
}

// If we have moved pages, handle them
if movedPages != nil {
for _, page := range movedPages {
var movedPage common.DocsPage
oldPage := db.GetAtlasPageData(project.ProjectName, page.OldPageId)

if oldPage != nil {
movedPage = *oldPage
movedPage.ID = page.NewPageId
newPageUrl := utils.ConvertAtlasPageIdToProductionUrl(page.NewPageId, project.ProdUrl)
movedPage.DateLastUpdated = time.Now()
movedPage.PageURL = newPageUrl
} else {
movedPage = MakeNewPage(page.PageData, project.ProjectName, project.ProdUrl, llm, ctx)
movedPage.DateAdded = page.DateAdded
}

// Remove the old page from the DB
db.RemovePageFromAtlas(project.ProjectName, page.OldPageId)

// Append the "moved" page to the `newPageDBEntries` slice. The page ID doesn't match the old one, so we
// write it to the DB as a new page. Since we just deleted the old page, the page count stays the same
// and the DB ends up with the up-to-date data.
newPageDBEntries = append(newPageDBEntries, movedPage)

incomingAstCodeNodes, incomingAstLiteralIncludeNodes, incomingAstIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(page.PageData.AST)
incomingAstCodeNodeCount := len(incomingAstCodeNodes)
incomingAstLiteralIncludeNodesCount := len(incomingAstLiteralIncludeNodes)
incomingAstIoCodeBlockNodesCount := len(incomingAstIoCodeBlockNodes)
// Update the project counts for the "existing" page
report = IncrementProjectCountsForExistingPage(incomingAstCodeNodeCount, incomingAstLiteralIncludeNodesCount, incomingAstIoCodeBlockNodesCount, movedPage, report)

// Report it in the logs as a moved page
stringMessageForReport := fmt.Sprintf("Old page ID: %s, new page ID: %s", page.OldPageId, page.NewPageId)
report = utils.ReportChanges(types.PageMoved, report, stringMessageForReport)
if movedPage.CodeNodesTotal != incomingAstCodeNodeCount {
utils.ReportIssues(types.CodeNodeCountIssue, report, page.NewPageId, page.CodeNodeCount, len(incomingAstCodeNodes))
}
utils.UpdateSecondaryTarget()
}
}

// Get the existing "summaries" document from the DB, and update it.
var summaryDoc common.CollectionReport
Expand All @@ -65,5 +126,17 @@ func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetail
LogReportForProject(project.ProjectName, report)

// At this point, we have all the new and updated pages and an updated summary. Write updates to Atlas.
db.BatchUpdateCollection(project.ProjectName, newPages, updatedPages, summaryDoc)
db.BatchUpdateCollection(project.ProjectName, newPageDBEntries, updatedPages, summaryDoc)
}

// getNewOrMovedPageDetails builds the types.NewOrMovedPage record for an incoming page. It extracts the
// page's code examples from the AST to record how many code nodes, literalincludes, and io-code-blocks
// the page contains, converts the Snooty page ID to its Atlas page ID, and keeps the original metadata
// alongside those counts. Fields not assigned here are left at their zero values.
func getNewOrMovedPageDetails(metadata types.PageMetadata) types.NewOrMovedPage {
	codeNodes, literalIncludes, ioCodeBlocks := snooty.GetCodeExamplesFromIncomingData(metadata.AST)

	var details types.NewOrMovedPage
	details.PageId = utils.ConvertSnootyPageIdToAtlasPageId(metadata.PageID)
	details.CodeNodeCount = len(codeNodes)
	details.LiteralIncludeCount = len(literalIncludes)
	details.IoCodeBlockCount = len(ioCodeBlocks)
	details.PageData = metadata
	return details
}
21 changes: 16 additions & 5 deletions audit/gdcd/MakeLanguagesArray.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,23 @@ func MakeLanguagesArray(codeNodes []common.CodeNode, literalIncludeNodes []types
continue
} else {
if languageCounts, exists := languages[node.Language]; exists {
languageCounts.Total += 1
languages[node.Language] = languageCounts
if node.InstancesOnPage > 0 {
languageCounts.Total += node.InstancesOnPage
languages[node.Language] = languageCounts
} else {
languageCounts.Total += 1
languages[node.Language] = languageCounts
}
} else {
countsForLang := languages[common.Undefined]
countsForLang.LiteralIncludes += 1
languages[common.Undefined] = countsForLang
if node.InstancesOnPage > 0 {
countsForLang := languages[common.Undefined]
countsForLang.Total += node.InstancesOnPage
languages[common.Undefined] = countsForLang
} else {
countsForLang := languages[common.Undefined]
countsForLang.Total += 1
languages[common.Undefined] = countsForLang
}
}
}
}
Expand Down
51 changes: 33 additions & 18 deletions audit/gdcd/MakeNewPage.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package main
import (
"common"
"context"
add_code_examples "gdcd/add-code-examples"
"gdcd/snooty"
"gdcd/types"
"gdcd/utils"
Expand All @@ -12,35 +11,51 @@ import (
"github.com/tmc/langchaingo/llms/ollama"
)

func MakeNewPage(data types.PageWrapper, siteUrl string, report types.ProjectReport, llm *ollama.LLM, ctx context.Context) (common.DocsPage, types.ProjectReport) {
incomingCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(data.Data.AST)
func MakeNewPage(data types.PageMetadata, projectName string, siteUrl string, llm *ollama.LLM, ctx context.Context) common.DocsPage {
incomingCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(data.AST)
incomingCodeNodeCount := len(incomingCodeNodes)
incomingLiteralIncludeNodeCount := len(incomingLiteralIncludeNodes)
incomingIoCodeNodeCount := len(incomingIoCodeBlockNodes)
pageId := utils.ConvertSnootyPageIdToAtlasPageId(data.Data.PageID)
pageUrl := utils.ConvertSnootyPageIdToProductionUrl(data.Data.PageID, siteUrl)
product, subProduct := GetProductSubProduct(report.ProjectName, pageUrl)
pageId := utils.ConvertSnootyPageIdToAtlasPageId(data.PageID)
pageUrl := utils.ConvertSnootyPageIdToProductionUrl(data.PageID, siteUrl)
product, subProduct := GetProductSubProduct(projectName, pageUrl)
var isDriversProject bool
if product == "Drivers" {
isDriversProject = true
} else {
isDriversProject = false
}
newAppliedUsageExampleCount := 0
var newCodeNodes []common.CodeNode

// Some of the new code examples coming in from the page may be duplicates. So we first make Sha256 hashes of the
// incoming code examples, and count the number of times the hash appears on the page.
snootySha256Hashes := make(map[string]int)
snootySha256ToAstNodeMap := make(map[string]types.ASTNode)

for _, node := range incomingCodeNodes {
newNode := snooty.MakeCodeNodeFromSnootyAST(node, llm, ctx, isDriversProject)
newCodeNodes = append(newCodeNodes, newNode)
if add_code_examples.IsNewAppliedUsageExample(newNode) {
newAppliedUsageExampleCount++
// This makes a hash from the whitespace-trimmed AST node. We trim whitespace on AST nodes before adding
// them to the DB, so this ensures an incoming node hash can match a whitespace-trimmed existing node hash.
hash := snooty.MakeSha256HashForCode(node.Value)

// Add the hash as an entry in the map, and increment its counter. If the hash does not already exist in the map,
// this will create it. If it does already exist, this will just increment its counter.
snootySha256Hashes[hash]++
snootySha256ToAstNodeMap[hash] = node
}

// Then, we go through the hashes, create the corresponding codeNodes, and set the `InstancesOnPage` if the example
// appears more than once on the page.
var newCodeNodes []common.CodeNode
for hash, count := range snootySha256Hashes {
newNode := snooty.MakeCodeNodeFromSnootyAST(snootySha256ToAstNodeMap[hash], llm, ctx, isDriversProject)
if count > 1 {
newNode.InstancesOnPage = count
}
newCodeNodes = append(newCodeNodes, newNode)
}
maybeKeywords := snooty.GetMetaKeywords(data.Data.AST.Children)

languagesArrayValues := MakeLanguagesArray(newCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes)
maybeKeywords := snooty.GetMetaKeywords(data.AST.Children)

// Report relevant details for the new page
report = UpdateProjectReportForNewPage(incomingCodeNodeCount, incomingLiteralIncludeNodeCount, incomingIoCodeNodeCount, len(newCodeNodes), newAppliedUsageExampleCount, pageId, report)
languagesArrayValues := MakeLanguagesArray(newCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes)

return common.DocsPage{
ID: pageId,
Expand All @@ -52,9 +67,9 @@ func MakeNewPage(data types.PageWrapper, siteUrl string, report types.ProjectRep
LiteralIncludesTotal: incomingLiteralIncludeNodeCount,
Nodes: &newCodeNodes,
PageURL: pageUrl,
ProjectName: report.ProjectName,
ProjectName: projectName,
Product: product,
SubProduct: subProduct,
Keywords: maybeKeywords,
}, report
}
}
16 changes: 11 additions & 5 deletions audit/gdcd/UpdateExistingPage.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ package main
import (
"common"
"context"
add_code_examples "gdcd/add-code-examples"
compare_code_examples "gdcd/compare-code-examples"
"gdcd/add-code-examples"
"gdcd/compare-code-examples"
"gdcd/db"
"gdcd/snooty"
"gdcd/types"
Expand All @@ -17,14 +17,15 @@ import (
func UpdateExistingPage(existingPage common.DocsPage, data types.PageWrapper, projectReport types.ProjectReport, llm *ollama.LLM, ctx context.Context) (*common.DocsPage, types.ProjectReport) {
var existingCurrentCodeNodes []common.CodeNode
var existingRemovedCodeNodes []common.CodeNode
existingCodeNodeCount := 0
// Some of the existing Nodes on the page could have been previously removed from the page. So we need to know which
// nodes are "currently" on the page, and which nodes have already been removed. The ones that are "currently" on the
// page should be used to compare code examples, but the ones that have already been removed from the page will be
// appended to the Nodes array without changes after making all the other updates.
if existingPage.Nodes != nil {
existingCurrentCodeNodes, existingRemovedCodeNodes = db.GetCurrentRemovedAtlasCodeNodes(*existingPage.Nodes)
existingCodeNodeCount = compare_code_examples.GetCodeNodeCount(*existingPage.Nodes)
}
existingCodeNodeCount := len(existingCurrentCodeNodes)
incomingCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(data.Data.AST)
maybePageKeywords := snooty.GetMetaKeywords(data.Data.AST.Children)
newAppliedUsageExampleCount := 0
Expand Down Expand Up @@ -75,7 +76,12 @@ func UpdateExistingPage(existingPage common.DocsPage, data types.PageWrapper, pr
node.DateRemoved = time.Now()
node.IsRemoved = true
updatedCodeNodes = append(updatedCodeNodes, node)
newRemovedNodeCount++
if node.InstancesOnPage > 0 {
newRemovedNodeCount += node.InstancesOnPage
node.InstancesOnPage = 0
} else {
newRemovedNodeCount++
}
} else {
updatedCodeNodes = append(updatedCodeNodes, node)
}
Expand Down Expand Up @@ -123,7 +129,7 @@ func UpdateExistingPage(existingPage common.DocsPage, data types.PageWrapper, pr
newAppliedUsageExampleCount++
}
}
newCodeNodeCount := len(newCodeNodes)
newCodeNodeCount := compare_code_examples.GetCodeNodeCount(newCodeNodes)
updatedPage.Nodes = &newCodeNodes

// Update the AST code node count, io-block-count and literalinclude count
Expand Down
40 changes: 27 additions & 13 deletions audit/gdcd/UpdateProjectReportForNewPage.go
Original file line number Diff line number Diff line change
@@ -1,29 +1,43 @@
package main

import (
"common"
"gdcd/add-code-examples"
compare_code_examples "gdcd/compare-code-examples"
"gdcd/types"
"gdcd/utils"
)

func UpdateProjectReportForNewPage(incomingCodeNodeCount int, incomingLiteralIncludeNodeCount int, incomingIoCodeBlockNodeCount int, newCodeNodes int, newAppliedUsageExampleCount int, pageId string, report types.ProjectReport) types.ProjectReport {
report.Counter.IncomingCodeNodesCount += incomingCodeNodeCount
report.Counter.IncomingLiteralIncludeCount += incomingLiteralIncludeNodeCount
report.Counter.IncomingIoCodeBlockCount += incomingIoCodeBlockNodeCount
report.Counter.NewCodeNodesCount += newCodeNodes
report.Counter.NewAppliedUsageExamplesCount += newAppliedUsageExampleCount
func UpdateProjectReportForNewPage(page common.DocsPage, report types.ProjectReport) types.ProjectReport {
report.Counter.IncomingCodeNodesCount += page.CodeNodesTotal
report.Counter.IncomingLiteralIncludeCount += page.LiteralIncludesTotal
report.Counter.IncomingIoCodeBlockCount += page.IoCodeBlocksTotal
report.Counter.NewCodeNodesCount += page.CodeNodesTotal
report.Counter.NewPagesCount += 1
report = utils.ReportChanges(types.PageCreated, report, pageId)
if newCodeNodes > 0 {
report = utils.ReportChanges(types.CodeExampleCreated, report, pageId, newCodeNodes)
report = utils.ReportChanges(types.PageCreated, report, page.ID)
if page.CodeNodesTotal > 0 {
report = utils.ReportChanges(types.CodeExampleCreated, report, page.ID, page.CodeNodesTotal)
}

// Figure out how many of the page's code examples are new applied usage examples
newAppliedUsageExampleCount := 0
newCodeNodeCount := 0
if page.Nodes != nil {
for _, node := range *page.Nodes {
if add_code_examples.IsNewAppliedUsageExample(node) {
newAppliedUsageExampleCount++
}
}
newCodeNodeCount = compare_code_examples.GetCodeNodeCount(*page.Nodes)
}
report.Counter.NewAppliedUsageExamplesCount += newAppliedUsageExampleCount

if newAppliedUsageExampleCount > 0 {
report = utils.ReportChanges(types.AppliedUsageExampleAdded, report, pageId, newAppliedUsageExampleCount)
report = utils.ReportChanges(types.AppliedUsageExampleAdded, report, page.ID, newAppliedUsageExampleCount)
}

newCodeNodeCount := newCodeNodes
if incomingCodeNodeCount != newCodeNodeCount {
report = utils.ReportIssues(types.CodeNodeCountIssue, report, pageId, incomingCodeNodeCount, newCodeNodeCount)
if page.CodeNodesTotal != newCodeNodeCount {
report = utils.ReportIssues(types.CodeNodeCountIssue, report, page.ID, page.CodeNodesTotal, newCodeNodeCount)
}
return report
}
Loading