diff --git a/package-lock.json b/package-lock.json index 38e96a97eae..dd28eabe03b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -44366,6 +44366,7 @@ "@mongodb-js/compass-workspaces": "^0.51.0", "@mongodb-js/connection-info": "^0.17.1", "@mongodb-js/mongodb-constants": "^0.12.2", + "bson": "^6.10.1", "compass-preferences-model": "^2.50.0", "hadron-document": "^8.9.5", "mongodb": "^6.17.0", @@ -57155,6 +57156,7 @@ "@types/react": "^17.0.5", "@types/react-dom": "^17.0.10", "@types/sinon-chai": "^3.2.5", + "bson": "^6.10.1", "chai": "^4.3.6", "compass-preferences-model": "^2.50.0", "depcheck": "^1.4.1", diff --git a/packages/compass-collection/package.json b/packages/compass-collection/package.json index 7a29fa00f77..b520c0c15e1 100644 --- a/packages/compass-collection/package.json +++ b/packages/compass-collection/package.json @@ -66,7 +66,8 @@ "react": "^17.0.2", "react-redux": "^8.1.3", "redux": "^4.2.1", - "redux-thunk": "^2.4.2" + "redux-thunk": "^2.4.2", + "bson": "^6.10.1" }, "devDependencies": { "@mongodb-js/eslint-config-compass": "^1.4.6", diff --git a/packages/compass-collection/src/modules/collection-tab.ts b/packages/compass-collection/src/modules/collection-tab.ts index 405dfd333be..874e4026035 100644 --- a/packages/compass-collection/src/modules/collection-tab.ts +++ b/packages/compass-collection/src/modules/collection-tab.ts @@ -1,5 +1,5 @@ import type { Reducer, AnyAction, Action } from 'redux'; -import { analyzeDocuments, type Schema } from 'mongodb-schema'; +import { analyzeDocuments } from 'mongodb-schema'; import type { CollectionMetadata } from 'mongodb-collection-model'; import type { ThunkAction } from 'redux-thunk'; @@ -19,8 +19,10 @@ import { SCHEMA_ANALYSIS_STATE_INITIAL, type SchemaAnalysisError, type SchemaAnalysisState, + type FieldInfo, } from '../schema-analysis-types'; import { calculateSchemaDepth } from '../calculate-schema-depth'; +import { processSchema } from '../transform-schema-to-field-info'; import type { Document, 
MongoError } from 'mongodb'; const DEFAULT_SAMPLE_SIZE = 100; @@ -106,7 +108,7 @@ interface SchemaAnalysisStartedAction { interface SchemaAnalysisFinishedAction { type: CollectionActions.SchemaAnalysisFinished; - schema: Schema; + processedSchema: Record; sampleDocument: Document; schemaMetadata: { maxNestingDepth: number; @@ -201,7 +203,7 @@ const reducer: Reducer = ( ...state, schemaAnalysis: { status: SCHEMA_ANALYSIS_STATE_COMPLETE, - schema: action.schema, + processedSchema: action.processedSchema, sampleDocument: action.sampleDocument, schemaMetadata: action.schemaMetadata, }, @@ -420,7 +422,9 @@ export const analyzeCollectionSchema = (): CollectionThunkAction< schema.fields = schema.fields.filter( ({ path }) => !isInternalFieldPath(path[0]) ); - // TODO: Transform schema to structure that will be used by the LLM. + + // Transform schema to structure that will be used by the LLM + const processedSchema = processSchema(schema); const maxNestingDepth = await calculateSchemaDepth(schema); const { database, collection } = toNS(namespace); @@ -432,7 +436,7 @@ export const analyzeCollectionSchema = (): CollectionThunkAction< }; dispatch({ type: CollectionActions.SchemaAnalysisFinished, - schema, + processedSchema, sampleDocument: sampleDocuments[0], schemaMetadata, }); diff --git a/packages/compass-collection/src/schema-analysis-types.ts b/packages/compass-collection/src/schema-analysis-types.ts index 0d63615f77a..83a501ceb20 100644 --- a/packages/compass-collection/src/schema-analysis-types.ts +++ b/packages/compass-collection/src/schema-analysis-types.ts @@ -1,5 +1,5 @@ import type { Document } from 'mongodb'; -import { type Schema } from 'mongodb-schema'; +import type { PrimitiveSchemaType } from 'mongodb-schema'; export const SCHEMA_ANALYSIS_STATE_INITIAL = 'initial'; export const SCHEMA_ANALYSIS_STATE_ANALYZING = 'analyzing'; @@ -30,9 +30,35 @@ export type SchemaAnalysisErrorState = { error: SchemaAnalysisError; }; +/** + * MongoDB schema type + */ +export type 
MongoDBFieldType = PrimitiveSchemaType['name']; + +/** + * Primitive values that can appear in sample_values after BSON-to-primitive conversion. + * These are the JavaScript primitive equivalents of BSON values. + */ +export type SampleValue = + | string // String, Symbol, ObjectId, Binary, RegExp, Code, etc. (converted to string) + | number // Number, Int32, Long, Double, Decimal128, Timestamp (converted via valueOf()) + | boolean + | Date + | null + | undefined; + +/** + * Schema field information (for LLM processing) + */ +export interface FieldInfo { + type: MongoDBFieldType; // MongoDB primitive type + sample_values?: SampleValue[]; // Primitive sample values (limited to 10) + probability?: number; // 0.0 - 1.0 field frequency +} + export type SchemaAnalysisCompletedState = { status: typeof SCHEMA_ANALYSIS_STATE_COMPLETE; - schema: Schema; + processedSchema: Record; sampleDocument: Document; schemaMetadata: { maxNestingDepth: number; diff --git a/packages/compass-collection/src/transform-schema-to-field-info.spec.ts b/packages/compass-collection/src/transform-schema-to-field-info.spec.ts new file mode 100644 index 00000000000..04002bcda05 --- /dev/null +++ b/packages/compass-collection/src/transform-schema-to-field-info.spec.ts @@ -0,0 +1,1108 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ +import { expect } from 'chai'; +import { + Int32, + Double, + ObjectId, + Binary, + BSONRegExp, + Code, + BSONSymbol, + Timestamp, + MaxKey, + MinKey, + Long, + Decimal128, +} from 'bson'; +import { processSchema } from './transform-schema-to-field-info'; +import type { Schema } from 'mongodb-schema'; + +describe('processSchema', function () { + it('selects most probable type when multiple types exist', function () { + const schema: Schema = { + fields: [ + { + name: 'mixed', + path: ['mixed'], + count: 10, + type: ['String', 'Number'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'String', + bsonType: 'String', + path: ['mixed'], + count: 
8, + probability: 0.8, + values: ['text'], + }, + { + name: 'Number', + bsonType: 'Number', + path: ['mixed'], + count: 2, + probability: 0.2, + values: [new Int32(42)], + }, + ], + }, + ], + count: 10, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({ + mixed: { + type: 'String', // Should pick the most probable type + sample_values: ['text'], + probability: 1.0, + }, + }); + }); + + it('filters out undefined and null types', function () { + const schema: Schema = { + fields: [ + { + name: 'optional', + path: ['optional'], + count: 3, + type: ['String', 'Undefined', 'Null'], + probability: 0.67, + hasDuplicates: false, + types: [ + { + name: 'String', + bsonType: 'String', + path: ['optional'], + count: 1, + probability: 0.33, + values: ['value'], + }, + { + name: 'Undefined', + bsonType: 'Undefined', + path: ['optional'], + count: 1, + probability: 0.33, + }, + { + name: 'Null', + bsonType: 'Null', + path: ['optional'], + count: 1, + probability: 0.33, + }, + ], + }, + ], + count: 3, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({ + optional: { + type: 'String', + sample_values: ['value'], + probability: 0.67, + }, + }); + }); + + it('handles fields with no types', function () { + const schema: Schema = { + fields: [ + { + name: 'empty', + path: ['empty'], + count: 0, + type: [], + hasDuplicates: false, + probability: 0.0, + types: [], + }, + ], + count: 1, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({}); + }); + + it('handles empty schema', function () { + const schema: Schema = { + fields: [], + count: 0, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({}); + }); + + it('limits sample values to 10', function () { + const manyValues = Array.from({ length: 20 }, (_, i) => `value${i}`); + + const schema: Schema = { + fields: [ + { + name: 'field', + path: ['field'], + count: 20, + type: ['String'], + probability: 1.0, + 
hasDuplicates: false, + types: [ + { + name: 'String', + bsonType: 'String', + path: ['field'], + count: 20, + probability: 1.0, + values: manyValues, + }, + ], + }, + ], + count: 1, + }; + + const result = processSchema(schema); + + expect(result.field.sample_values).to.have.length(10); + expect(result.field.sample_values).to.deep.equal(manyValues.slice(0, 10)); + }); + + it('transforms simple primitive fields', function () { + const schema: Schema = { + fields: [ + { + name: 'name', + path: ['name'], + count: 3, + type: ['String'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'String', + bsonType: 'String', + path: ['name'], + count: 3, + probability: 1.0, + values: ['John', 'Jane', 'Bob'], + }, + ], + }, + { + name: 'age', + path: ['age'], + count: 3, + type: ['Number'], + probability: 0.9, + hasDuplicates: false, + types: [ + { + name: 'Number', + bsonType: 'Number', + path: ['age'], + count: 3, + probability: 1.0, + values: [new Int32(25), new Int32(30), new Int32(35)], + }, + ], + }, + { + name: 'isActive', + path: ['isActive'], + count: 3, + type: ['Boolean'], + probability: 0.8, + hasDuplicates: false, + types: [ + { + name: 'Boolean', + bsonType: 'Boolean', + path: ['isActive'], + count: 3, + probability: 1.0, + values: [true, false, true], + }, + ], + }, + { + name: 'createdAt', + path: ['createdAt'], + count: 2, + type: ['Date'], + probability: 0.7, + hasDuplicates: false, + types: [ + { + name: 'Date', + bsonType: 'Date', + path: ['createdAt'], + count: 2, + probability: 1.0, + values: [new Date('2023-01-01'), new Date('2023-06-15')], + }, + ], + }, + ], + count: 3, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({ + name: { + type: 'String', + sample_values: ['John', 'Jane', 'Bob'], + probability: 1.0, + }, + age: { + type: 'Number', + sample_values: [25, 30, 35], + probability: 0.9, + }, + isActive: { + type: 'Boolean', + sample_values: [true, false, true], + probability: 0.8, + }, + createdAt: { + 
type: 'Date', + sample_values: [new Date('2023-01-01'), new Date('2023-06-15')], + probability: 0.7, + }, + }); + }); + + it('handles various BSON types', function () { + const schema: Schema = { + fields: [ + { + name: 'objectId', + path: ['objectId'], + count: 1, + type: ['ObjectId'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'ObjectId', + bsonType: 'ObjectId', + path: ['objectId'], + count: 1, + probability: 1.0, + values: [new ObjectId('642d766b7300158b1f22e972')], + }, + ], + }, + { + name: 'binary', + path: ['binary'], + count: 1, + type: ['Binary'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Binary', + bsonType: 'Binary', + path: ['binary'], + count: 1, + probability: 1.0, + values: [new Binary(Buffer.from('test'))], + }, + ], + }, + { + name: 'regex', + path: ['regex'], + count: 1, + type: ['RegExp'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'RegExp', + bsonType: 'BSONRegExp', + path: ['regex'], + count: 1, + probability: 1.0, + values: [new BSONRegExp('pattern', 'i')], + }, + ], + }, + { + name: 'code', + path: ['code'], + count: 1, + type: ['Code'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Code', + bsonType: 'Code', + path: ['code'], + count: 1, + probability: 1.0, + values: [new Code('function() {}')], + }, + ], + }, + { + name: 'long', + path: ['long'], + count: 1, + type: ['Long'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Long', + bsonType: 'Long', + path: ['long'], + count: 1, + probability: 1.0, + values: [Long.fromNumber(123456789)], + }, + ], + }, + { + name: 'decimal', + path: ['decimal'], + count: 1, + type: ['Decimal128'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Decimal128', + bsonType: 'Decimal128', + path: ['decimal'], + count: 1, + probability: 1.0, + values: [Decimal128.fromString('123.456')], + }, + ], + }, + { + name: 'timestamp', + path: ['timestamp'], + count: 1, + type: 
['Timestamp'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Timestamp', + bsonType: 'Timestamp', + path: ['timestamp'], + count: 1, + probability: 1.0, + values: [new Timestamp({ t: 1, i: 1 })], + }, + ], + }, + { + name: 'maxKey', + path: ['maxKey'], + count: 1, + type: ['MaxKey'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'MaxKey', + bsonType: 'MaxKey', + path: ['maxKey'], + count: 1, + probability: 1.0, + values: [new MaxKey()], + }, + ], + }, + { + name: 'minKey', + path: ['minKey'], + count: 1, + type: ['MinKey'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'MinKey', + bsonType: 'MinKey', + path: ['minKey'], + count: 1, + probability: 1.0, + values: [new MinKey()], + }, + ], + }, + { + name: 'symbol', + path: ['symbol'], + count: 1, + type: ['Symbol'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Symbol', + bsonType: 'BSONSymbol', + path: ['symbol'], + count: 1, + probability: 1.0, + values: [new BSONSymbol('symbol')], + }, + ], + }, + ], + count: 1, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({ + objectId: { + type: 'ObjectId', + sample_values: ['642d766b7300158b1f22e972'], + probability: 1.0, + }, + binary: { + type: 'Binary', + sample_values: ['dGVzdA=='], + probability: 1.0, + }, + regex: { + type: 'RegExp', + sample_values: ['pattern'], + probability: 1.0, + }, + code: { + type: 'Code', + sample_values: ['function() {}'], + probability: 1.0, + }, + long: { + type: 'Long', + sample_values: [123456789], + probability: 1.0, + }, + decimal: { + type: 'Decimal128', + sample_values: [123.456], + probability: 1.0, + }, + timestamp: { + type: 'Timestamp', + sample_values: [4294967297], + probability: 1.0, + }, + maxKey: { + type: 'MaxKey', + sample_values: ['MaxKey'], + probability: 1.0, + }, + minKey: { + type: 'MinKey', + sample_values: ['MinKey'], + probability: 1.0, + }, + symbol: { + type: 'Symbol', + sample_values: ['symbol'], + 
probability: 1.0, + }, + }); + }); + + it('transforms nested document field', function () { + const schema: Schema = { + fields: [ + { + name: 'user', + path: ['user'], + count: 2, + type: ['Document'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Document', + bsonType: 'Document', + path: ['user'], + count: 2, + probability: 1.0, + fields: [ + { + name: 'name', + path: ['user', 'name'], + count: 1, + type: ['String'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'String', + bsonType: 'String', + path: ['user', 'name'], + count: 1, + probability: 1.0, + values: ['John'], + }, + ], + }, + { + name: 'age', + path: ['user', 'age'], + count: 2, + type: ['Number'], + probability: 0.8, + hasDuplicates: false, + types: [ + { + name: 'Number', + bsonType: 'Number', + path: ['user', 'age'], + count: 2, + probability: 1.0, + values: [new Int32(25), new Int32(30)], + }, + ], + }, + ], + }, + ], + }, + ], + count: 2, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({ + 'user.name': { + type: 'String', + sample_values: ['John'], + probability: 1.0, + }, + 'user.age': { + type: 'Number', + sample_values: [25, 30], + probability: 0.8, + }, + }); + }); + + it('transforms array field', function () { + const schema: Schema = { + fields: [ + { + name: 'tags', + path: ['tags'], + count: 2, + type: ['Array'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Array', + bsonType: 'Array', + path: ['tags'], + count: 2, + probability: 1.0, + + lengths: [2, 1], + averageLength: 1.5, + totalCount: 3, + types: [ + { + name: 'String', + bsonType: 'String', + path: ['tags'], + count: 3, + probability: 1.0, + values: ['red', 'blue', 'green'], + }, + ], + }, + ], + }, + ], + count: 2, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({ + 'tags[]': { + type: 'String', + sample_values: ['red', 'blue', 'green'], + probability: 1.0, + }, + }); + }); + + it('handles deeply nested 
objects (documents)', function () { + const schema: Schema = { + fields: [ + { + name: 'level1', + path: ['level1'], + count: 1, + type: ['Document'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Document', + bsonType: 'Document', + path: ['level1'], + count: 1, + probability: 1.0, + fields: [ + { + name: 'level2', + path: ['level1', 'level2'], + count: 1, + type: ['Document'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Document', + bsonType: 'Document', + path: ['level1', 'level2'], + count: 1, + probability: 1.0, + fields: [ + { + name: 'value', + path: ['level1', 'level2', 'value'], + count: 1, + type: ['String'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'String', + bsonType: 'String', + path: ['level1', 'level2', 'value'], + count: 1, + probability: 1.0, + values: ['deep'], + }, + ], + }, + ], + }, + ], + }, + ], + }, + ], + }, + ], + count: 1, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({ + 'level1.level2.value': { + type: 'String', + sample_values: ['deep'], + probability: 1.0, + }, + }); + }); + + it('handles arrays of documents', function () { + const schema: Schema = { + fields: [ + { + name: 'items', + path: ['items'], + count: 1, + type: ['Array'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Array', + bsonType: 'Array', + path: ['items'], + count: 1, + probability: 1.0, + lengths: [2], + averageLength: 2, + totalCount: 2, + types: [ + { + name: 'Document', + bsonType: 'Document', + path: ['items'], + count: 2, + probability: 1.0, + fields: [ + { + name: 'id', + path: ['items', 'id'], + count: 2, + type: ['Number'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Number', + bsonType: 'Number', + path: ['items', 'id'], + count: 2, + probability: 1.0, + values: [new Int32(1), new Int32(2)], + }, + ], + }, + { + name: 'cost', + path: ['items', 'cost'], + count: 2, + type: ['Double'], + probability: 1.0, + 
hasDuplicates: false, + types: [ + { + name: 'Number', + bsonType: 'Number', + path: ['items', 'cost'], + count: 2, + probability: 1.0, + values: [new Double(10.5), new Double(25.0)], + }, + ], + }, + ], + }, + ], + }, + ], + }, + ], + count: 1, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({ + 'items[].id': { + type: 'Number', + sample_values: [1, 2], + probability: 1.0, + }, + 'items[].cost': { + type: 'Number', + sample_values: [10.5, 25.0], + probability: 1.0, + }, + }); + }); + + it('handles triple nested arrays (3D matrix)', function () { + // cube: [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + const schema: Schema = { + fields: [ + { + name: 'cube', + path: ['cube'], + count: 1, + type: ['Array'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Array', + bsonType: 'Array', + path: ['cube'], + count: 1, + probability: 1.0, + lengths: [2], + averageLength: 2, + totalCount: 2, + types: [ + { + name: 'Array', + bsonType: 'Array', + path: ['cube'], + count: 2, + probability: 1.0, + lengths: [2], + averageLength: 2, + totalCount: 4, + types: [ + { + name: 'Array', + bsonType: 'Array', + path: ['cube'], + count: 4, + probability: 1.0, + lengths: [2], + averageLength: 2, + totalCount: 8, + types: [ + { + name: 'Number', + bsonType: 'Number', + path: ['cube'], + count: 8, + probability: 1.0, + values: [ + new Int32(1), + new Int32(2), + new Int32(3), + new Int32(4), + new Int32(5), + new Int32(6), + new Int32(7), + new Int32(8), + ], + }, + ], + }, + ], + }, + ], + }, + ], + }, + ], + count: 1, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({ + 'cube[][][]': { + type: 'Number', + sample_values: [1, 2, 3, 4, 5, 6, 7, 8], + probability: 1.0, + }, + }); + }); + + it('handles arrays of arrays of documents', function () { + const schema: Schema = { + fields: [ + { + name: 'matrix', + path: ['matrix'], + count: 1, + type: ['Array'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + 
name: 'Array', + bsonType: 'Array', + path: ['matrix'], + count: 1, + probability: 1.0, + lengths: [2], + averageLength: 2, + totalCount: 2, + + types: [ + { + name: 'Array', + bsonType: 'Array', + path: ['matrix'], + count: 2, + probability: 1.0, + lengths: [1], + averageLength: 1, + totalCount: 2, + + types: [ + { + name: 'Document', + bsonType: 'Document', + path: ['matrix'], + count: 2, + probability: 1.0, + fields: [ + { + name: 'x', + path: ['matrix', 'x'], + count: 2, + type: ['Number'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Number', + bsonType: 'Number', + path: ['matrix', 'x'], + count: 2, + probability: 1.0, + values: [new Int32(1), new Int32(3)], + }, + ], + }, + { + name: 'y', + path: ['matrix', 'y'], + count: 2, + type: ['Number'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Number', + bsonType: 'Number', + path: ['matrix', 'y'], + count: 2, + probability: 1.0, + values: [new Int32(2), new Int32(4)], + }, + ], + }, + ], + }, + ], + }, + ], + }, + ], + }, + ], + count: 1, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({ + 'matrix[][].x': { + type: 'Number', + sample_values: [1, 3], + probability: 1.0, + }, + 'matrix[][].y': { + type: 'Number', + sample_values: [2, 4], + probability: 1.0, + }, + }); + }); + + it('handles array of documents with nested arrays', function () { + // teams: [{ name: "Team A", members: ["Alice", "Bob"] }, { name: "Team B", members: ["Charlie"] }] + const schema: Schema = { + fields: [ + { + name: 'teams', + path: ['teams'], + count: 1, + type: ['Array'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Array', + bsonType: 'Array', + path: ['teams'], + count: 1, + probability: 1.0, + lengths: [2], + averageLength: 2, + totalCount: 2, + + types: [ + { + name: 'Document', + bsonType: 'Document', + path: ['teams'], + count: 2, + probability: 1.0, + fields: [ + { + name: 'name', + path: ['teams', 'name'], + count: 2, + type: 
['String'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'String', + bsonType: 'String', + path: ['teams', 'name'], + count: 2, + probability: 1.0, + values: ['Team A', 'Team B'], + }, + ], + }, + { + name: 'members', + path: ['teams', 'members'], + count: 2, + type: ['Array'], + probability: 1.0, + hasDuplicates: false, + types: [ + { + name: 'Array', + bsonType: 'Array', + path: ['teams', 'members'], + count: 2, + probability: 1.0, + lengths: [2, 1], + averageLength: 1.5, + totalCount: 3, + types: [ + { + name: 'String', + bsonType: 'String', + path: ['teams', 'members'], + count: 3, + probability: 1.0, + values: ['Alice', 'Bob', 'Charlie'], + }, + ], + }, + ], + }, + ], + }, + ], + }, + ], + }, + ], + count: 1, + }; + + const result = processSchema(schema); + + expect(result).to.deep.equal({ + 'teams[].name': { + type: 'String', + sample_values: ['Team A', 'Team B'], + probability: 1.0, + }, + 'teams[].members[]': { + type: 'String', + sample_values: ['Alice', 'Bob', 'Charlie'], + probability: 1.0, + }, + }); + }); +}); diff --git a/packages/compass-collection/src/transform-schema-to-field-info.ts b/packages/compass-collection/src/transform-schema-to-field-info.ts new file mode 100644 index 00000000000..9d782c7a187 --- /dev/null +++ b/packages/compass-collection/src/transform-schema-to-field-info.ts @@ -0,0 +1,223 @@ +import type { + Schema, + SchemaField, + SchemaType, + ArraySchemaType, + DocumentSchemaType, + PrimitiveSchemaType, + ConstantSchemaType, +} from 'mongodb-schema'; +import type { FieldInfo, SampleValue } from './schema-analysis-types'; +import { + ObjectId, + Binary, + BSONRegExp, + Code, + Timestamp, + MaxKey, + MinKey, + BSONSymbol, + Long, + Decimal128, +} from 'bson'; + +/** + * This module transforms mongodb-schema output into a flat, LLM-friendly format using + * dot notation for nested fields and bracket notation for arrays. + * + * Algorithm Overview: + * - Start with top-level fields. 
// - For each field (processNamedField), process based on type (processType):
//   - Primitives: Create result entry
//   - Documents: Add parent field name to path using dot notation, recurse into nested fields (processNamedField)
//   - Arrays: Add [] to path, recurse into element type (processType)
//
// Notation examples:
// - Nested documents: user.profile.name (dot notation)
// - Array: users[] (bracket notation)
// - Nested arrays: matrix[][] (multiple brackets)
// - Nested array of documents fields: users[].name (brackets + dots)
//
// (The terminator ending the next line closes the overview block comment that opens before this span.)
// */

/**
 * Maximum number of sample values to include for each field.
 */
const MAX_SAMPLE_VALUES = 10;

/**
 * Converts a sampled BSON value to its primitive JavaScript equivalent.
 *
 * - `null` / `undefined` and `Date` instances are returned unchanged.
 * - ObjectId, Binary (base64), BSONRegExp (pattern only), Code, BSONSymbol,
 *   MaxKey and MinKey become strings.
 * - Timestamp, Long and Decimal128 become numbers. Values outside the range
 *   a JavaScript number can represent exactly lose precision.
 * - Any other object is unwrapped through its `valueOf()` method.
 * - Everything else (string, number, boolean, ...) is passed through.
 *
 * @param value - A value taken from a schema type's `values` array.
 * @returns The primitive form of `value`.
 */
function convertBSONToPrimitive(value: unknown): SampleValue {
  // Handle null/undefined
  if (value === null || value === undefined) {
    return value;
  }

  // Keep Date as-is
  if (value instanceof Date) {
    return value;
  }

  // Convert BSON objects to primitives
  if (value instanceof ObjectId) {
    return value.toString();
  }
  if (value instanceof Binary) {
    return value.toString('base64');
  }
  if (value instanceof BSONRegExp) {
    // Only the pattern is kept; the regex options are intentionally dropped.
    return value.pattern;
  }
  if (value instanceof Code) {
    return value.code;
  }
  if (value instanceof Timestamp) {
    return value.toNumber();
  }
  if (value instanceof MaxKey) {
    return 'MaxKey';
  }
  if (value instanceof MinKey) {
    return 'MinKey';
  }
  if (value instanceof BSONSymbol) {
    return value.toString();
  }
  if (value instanceof Long) {
    return value.toNumber();
  }
  if (value instanceof Decimal128) {
    return parseFloat(value.toString());
  }

  // Handle objects with valueOf method (numeric wrapper types)
  if (value && typeof value === 'object' && 'valueOf' in value) {
    const result = (value as { valueOf(): unknown }).valueOf();
    return result as SampleValue;
  }

  return value as SampleValue;
}

/** True for the value-less constant types (`Null` / `Undefined`). */
function isConstantSchemaType(type: SchemaType): type is ConstantSchemaType {
  return type.name === 'Null' || type.name === 'Undefined';
}

/** True when the type entry describes an array. */
function isArraySchemaType(type: SchemaType): type is ArraySchemaType {
  return type.name === 'Array';
}

/** True when the type entry describes a nested document. */
function isDocumentSchemaType(type: SchemaType): type is DocumentSchemaType {
  return type.name === 'Document';
}

/** True for every type entry that is not constant, array or document. */
function isPrimitiveSchemaType(type: SchemaType): type is PrimitiveSchemaType {
  return (
    !isConstantSchemaType(type) &&
    !isArraySchemaType(type) &&
    !isDocumentSchemaType(type)
  );
}

/**
 * Stores `info` under `path` as an own, enumerable entry of `result`.
 *
 * A plain assignment (`result[path] = info`) with the key `__proto__` would
 * replace the prototype of `result` instead of creating an entry, silently
 * dropping the field; `Object.defineProperty` has no such special case. For
 * every other key the effect is the same as a plain assignment.
 */
function setFieldInfo(
  result: Record<string, FieldInfo>,
  path: string,
  info: FieldInfo
): void {
  Object.defineProperty(result, path, {
    value: info,
    enumerable: true,
    writable: true,
    configurable: true,
  });
}

/**
 * Transforms a raw mongodb-schema Schema into a flat Record<string, FieldInfo>
 * using dot notation for nested fields and bracket notation for arrays.
 *
 * @param schema - Schema analysis result; a missing `fields` list yields `{}`.
 * @returns One entry per primitive leaf, keyed by its flattened path
 *   (e.g. `user.name`, `tags[]`, `items[].id`).
 */
export function processSchema(schema: Schema): Record<string, FieldInfo> {
  const result: Record<string, FieldInfo> = {};

  if (!schema.fields) {
    return result;
  }

  // Process each top-level field
  for (const field of schema.fields) {
    processNamedField(field, '', result);
  }

  return result;
}

/**
 * Processes a named schema field and its nested types.
 *
 * Only the most probable non-constant type of the field is followed; fields
 * that have no such type contribute nothing.
 *
 * @param field - The field to process.
 * @param pathPrefix - Flattened path of the parent, or `''` at the top level.
 * @param result - Accumulator that receives the flattened entries.
 */
function processNamedField(
  field: SchemaField,
  pathPrefix: string,
  result: Record<string, FieldInfo>
): void {
  if (!field.types || field.types.length === 0) {
    return;
  }

  // Use the most frequent type (excluding 'Null' / 'Undefined')
  const primaryType = getMostFrequentType(field.types);
  if (!primaryType) {
    return;
  }

  const currentPath = pathPrefix ? `${pathPrefix}.${field.name}` : field.name;

  // Process based on the type
  processType(primaryType, currentPath, result, field.probability);
}

/**
 * Processes a specific schema type found at `currentPath`.
 *
 * @param type - The type entry to process.
 * @param currentPath - Flattened path accumulated so far.
 * @param result - Accumulator that receives the flattened entries.
 * @param fieldProbability - Probability of the closest enclosing named field;
 *   array nesting passes it through unchanged to the element type.
 */
function processType(
  type: SchemaType,
  currentPath: string,
  result: Record<string, FieldInfo>,
  fieldProbability: number
): void {
  if (isConstantSchemaType(type)) {
    return;
  }

  if (isArraySchemaType(type)) {
    // Array: add [] to path and recurse into element type
    const elementType = getMostFrequentType(type.types || []);
    if (!elementType) {
      return;
    }
    processType(elementType, `${currentPath}[]`, result, fieldProbability);
    return;
  }

  if (isDocumentSchemaType(type)) {
    // Document: process nested document fields
    for (const nestedField of type.fields || []) {
      processNamedField(nestedField, currentPath, result);
    }
    return;
  }

  if (isPrimitiveSchemaType(type)) {
    // Primitive: create entry. Defensive: tolerate a type entry without a
    // `values` array instead of throwing on `.slice`.
    const sampledValues = type.values || [];
    const fieldInfo: FieldInfo = {
      type: type.name,
      sample_values: sampledValues
        .slice(0, MAX_SAMPLE_VALUES)
        .map(convertBSONToPrimitive),
      probability: fieldProbability,
    };

    setFieldInfo(result, currentPath, fieldInfo);
  }
}

/**
 * Gets the most probable type from a list of types, excluding constant types
 * (Null/Undefined).
 *
 * @param types - Candidate type entries; the array is not modified.
 * @returns The candidate with the highest probability (a missing probability
 *   counts as 0, the earliest candidate wins ties), or `null` when no
 *   non-constant candidate exists.
 */
function getMostFrequentType(types: SchemaType[]): SchemaType | null {
  if (!types || types.length === 0) {
    return null;
  }

  let best: SchemaType | null = null;
  let bestProbability = 0;

  for (const candidate of types) {
    if (isConstantSchemaType(candidate)) {
      continue;
    }
    const probability = candidate.probability || 0;
    // Strict comparison keeps the first candidate among equal probabilities.
    if (best === null || probability > bestProbability) {
      best = candidate;
      bestProbability = probability;
    }
  }

  return best;
}