diff --git a/package.json b/package.json index 506d732d..a053c5b8 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "hawk.workers", "private": true, - "version": "0.1.1", + "version": "0.1.2", "description": "Hawk workers", "repository": "git@github.com:codex-team/hawk.workers.git", "license": "BUSL-1.1", diff --git a/workers/grouper/package.json b/workers/grouper/package.json index 6e98edc1..294322a5 100644 --- a/workers/grouper/package.json +++ b/workers/grouper/package.json @@ -1,6 +1,6 @@ { "name": "hawk-worker-grouper", - "version": "0.0.1", + "version": "0.0.2", "description": "Accepts processed errors from language-workers and saves it to the DB with grouping of similar ones. ", "main": "src/index.ts", "repository": "https://github.com/codex-team/hawk.workers/tree/master/workers/grouper", diff --git a/workers/grouper/src/data-filter.ts b/workers/grouper/src/data-filter.ts index 40a4acf9..2345b8e5 100644 --- a/workers/grouper/src/data-filter.ts +++ b/workers/grouper/src/data-filter.ts @@ -1,6 +1,11 @@ import type { EventAddons, EventData } from '@hawk.so/types'; import { unsafeFields } from '../../../lib/utils/unsafeFields'; +/** + * Maximum depth for object traversal to prevent excessive memory allocations + */ +const MAX_TRAVERSAL_DEPTH = 20; + /** * Recursively iterate through object and call function on each key * @@ -18,7 +23,12 @@ function forAll(obj: Record, callback: (path: string[], key: st if (!(typeof value === 'object' && !Array.isArray(value))) { callback(path, key, current); } else { - visit(value, [...path, key]); + /** + * Limit path depth to prevent excessive memory allocations from deep nesting + * This reduces GC pressure and memory usage for deeply nested objects + */ + const newPath = path.length < MAX_TRAVERSAL_DEPTH ? path.concat(key) : path; + visit(value, newPath); } } }; @@ -36,17 +46,71 @@ export default class DataFilter { private filteredValuePlaceholder = '[filtered]'; /** - * Possibly sensitive keys + * Possibly sensitive keys (lowercase; keys are compared via key.toLowerCase()) */ private possiblySensitiveDataKeys = new Set([ - 'pan', - 'secret', - 'credentials', - 'card[number]', - 'password', + /** + * Authorization and sessions + */ 'auth', + 'authorization', 'access_token', 'accesstoken', + 'token', + 'jwt', + 'session', + 'sessionid', + 'session_id', + /** + * API keys and secure tokens + */ + 'api_key', + 'apikey', + 'x-api-key', + 'x-auth-token', + 'bearer', + 'client_secret', + 'secret', + 'credentials', + /** + * Passwords + */ + 'password', + 'passwd', + 'mysql_pwd', + 'oldpassword', + 'old-password', + 'old_password', + 'newpassword', + 'new-password', + 'new_password', + /** + * Encryption keys + */ + 'private_key', + 'ssh_key', + /** + * Payments data + */ + 'card', + 'cardnumber', + 'card[number]', + 'creditcard', + 'credit_card', + 'pan', + 'pin', + 'security_code', + 'stripetoken', + 'cloudpayments_public_id', + 'cloudpayments_secret', + /** + * Config and connections + */ + 'dsn', + /** + * Personal data + */ + 'ssn', ]); /** @@ -54,6 +118,16 @@ export default class DataFilter { */ private bankCardRegex = /^(?:4[0-9]{12}(?:[0-9]{3})?|[25][1-7][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})$/g; + /** + * MongoDB ObjectId Regex (24 hexadecimal characters) + */ + private objectIdRegex = /^[0-9a-fA-F]{24}$/; + + /** + * UUID Regex - matches UUIDs with all dashes (8-4-4-4-12 format) or no dashes (32 hex chars) + */ + private uuidRegex = /^(?:[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}|[0-9a-fA-F]{32})$/; + /** * Accept event and process 'addons' and 'context' fields. * It mutates the original object @@ -96,12 +170,30 @@ export default class DataFilter { return value; } + /** + * Check if value matches MongoDB ObjectId pattern (24 hex chars) + * ObjectIds should not be filtered + */ + if (this.objectIdRegex.test(value)) { + return value; + } + + /** + * Check if value matches UUID pattern (with or without dashes) + * UUIDs should not be filtered + */ + if (this.uuidRegex.test(value)) { + return value; + } + /** * Remove all non-digit chars */ const clean = value.replace(/\D/g, ''); - // Reset last index to 0 + /** + * Reset last index to 0 + */ this.bankCardRegex.lastIndex = 0; if (!this.bankCardRegex.test(clean)) { return value; diff --git a/workers/grouper/src/index.ts b/workers/grouper/src/index.ts index 73f16fc7..542c701f 100644 --- a/workers/grouper/src/index.ts +++ b/workers/grouper/src/index.ts @@ -19,11 +19,16 @@ import type { RepetitionDBScheme } from '../types/repetition'; import { DatabaseReadWriteError, DiffCalculationError, ValidationError } from '../../../lib/workerErrors'; import { decodeUnsafeFields, encodeUnsafeFields } from '../../../lib/utils/unsafeFields'; import { MS_IN_SEC } from '../../../lib/utils/consts'; +import TimeMs from '../../../lib/utils/time'; import DataFilter from './data-filter'; import RedisHelper from './redisHelper'; import { computeDelta } from './utils/repetitionDiff'; import { rightTrim } from '../../../lib/utils/string'; import { hasValue } from '../../../lib/utils/hasValue'; + +/** + * eslint does not count decorators as a variable usage + */ /* eslint-disable-next-line no-unused-vars */ import { memoize } from '../../../lib/memoize'; @@ -31,7 +36,12 @@ import { memoize } from '../../../lib/memoize'; * eslint does not count decorators as a variable usage */ /* eslint-disable-next-line no-unused-vars */ -const MEMOIZATION_TTL = Number(process.env.MEMOIZATION_TTL ?? 0); +const MEMOIZATION_TTL = 600_000; + +/** + * Cache cleanup interval in minutes + */ +const CACHE_CLEANUP_INTERVAL_MINUTES = 5; /** * Error code of MongoDB key duplication error @@ -72,6 +82,11 @@ export default class GrouperWorker extends Worker { */ private redis = new RedisHelper(); + /** + * Interval for periodic cache cleanup to prevent memory leaks from unbounded cache growth + */ + private cacheCleanupInterval: NodeJS.Timeout | null = null; + /** * Start consuming messages */ @@ -85,6 +100,15 @@ export default class GrouperWorker extends Worker { await this.redis.initialize(); console.log('redis initialized'); + + /** + * Start periodic cache cleanup to prevent memory leaks from unbounded cache growth + * Runs every 5 minutes to clear old cache entries + */ + this.cacheCleanupInterval = setInterval(() => { + this.clearCache(); + }, CACHE_CLEANUP_INTERVAL_MINUTES * TimeMs.MINUTE); + await super.start(); } @@ -92,6 +116,14 @@ export default class GrouperWorker extends Worker { * Finish everything */ public async finish(): Promise { + /** + * Clear cache cleanup interval to prevent resource leaks + */ + if (this.cacheCleanupInterval) { + clearInterval(this.cacheCleanupInterval); + this.cacheCleanupInterval = null; + } + await super.finish(); this.prepareCache(); await this.eventsDb.close(); @@ -237,6 +269,12 @@ export default class GrouperWorker extends Worker { } as RepetitionDBScheme; repetitionId = await this.saveRepetition(task.projectId, newRepetition); + + /** + * Clear the large event payload references to allow garbage collection + * This prevents memory leaks from retaining full event objects after delta is computed + */ + delta = undefined; } /** @@ -334,7 +372,7 @@ export default class GrouperWorker extends Worker { * @param projectId - where to find * @param title - title of the event to find similar one */ - @memoize({ max: 200, ttl: MEMOIZATION_TTL, strategy: 'hash', skipCache: [undefined] }) + @memoize({ max: 50, ttl: MEMOIZATION_TTL, strategy: 'hash', skipCache: [undefined] }) private async findSimilarEvent(projectId: string, title: string): Promise { /** * If no match by Levenshtein, try matching by patterns diff --git a/workers/grouper/tests/data-filter.test.ts b/workers/grouper/tests/data-filter.test.ts index d0a4c3af..2f00dd68 100644 --- a/workers/grouper/tests/data-filter.test.ts +++ b/workers/grouper/tests/data-filter.test.ts @@ -28,20 +28,57 @@ function generateEvent({ context, addons }: {context?: Json, addons?: EventAddon } /** - * Example of object with sensitive information + * Example of object with sensitive information. + * Keys intentionally use snake_case/kebab-case to match data-filter list. */ +/* eslint-disable @typescript-eslint/naming-convention */ const sensitiveDataMock = { pan: '5500 0000 0000 0004', secret: 'D6A03F5C2E0E356F262D56F44370E1CD813583B2', credentials: '70BA33708CBFB103F1A8E34AFEF333BA7DC021022B2D9AAA583AABB8058D8D67', 'card[number]': '5500 0000 0000 0004', password: 'bFb7PBm6nZ7RJRq9', + oldpassword: 'oldSecret123', + newpassword: 'newSecret456', + 'old-password': 'oldSecretHyphen', + old_password: 'oldSecretUnderscore', + 'new-password': 'newSecretHyphen', + new_password: 'newSecretUnderscore', auth: 'C4CA4238A0B923820DCC509A6F75849B', - // eslint-disable-next-line @typescript-eslint/naming-convention access_token: '70BA33708CBFB103F1A8E34AFEF333BA7DC021022B2D9AAA583AABB8058D8D67', accessToken: '70BA33708CBFB103F1A8E34AFEF333BA7DC021022B2D9AAA583AABB8058D8D67', }; +/** + * Additional sensitive keys (newly added / previously uncovered). + * Keys intentionally use snake_case to match data-filter list. + */ +const additionalSensitiveDataMock = { + authorization: 'Bearer abc123', + token: 'token-value', + jwt: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9', + session: 'sess_xyz', + session_id: 'sid_789', + api_key: 'sk_live_xxx', + bearer: 'Bearer token', + client_secret: 'client_secret_value', + passwd: 'passwd_value', + mysql_pwd: 'mysql_pwd_value', + private_key: '-----BEGIN PRIVATE KEY-----', + ssh_key: 'ssh-rsa AAAA...', + card: '4111111111111111', + cardnumber: '5500000000000004', + creditcard: '4111111111111111', + pin: '1234', + security_code: '999', + stripetoken: 'tok_xxx', + cloudpayments_public_id: 'pk_xxx', + cloudpayments_secret: 'secret_xxx', + dsn: 'postgres://user:pass@host/db', + ssn: '123-45-6789', +}; +/* eslint-enable @typescript-eslint/naming-convention */ + describe('GrouperWorker', () => { const dataFilter = new DataFilter(); @@ -123,6 +160,34 @@ describe('GrouperWorker', () => { }); }); + test('should filter additional sensitive keys (authorization, token, payment, dsn, ssn, etc.) in context', async () => { + const event = generateEvent({ + context: additionalSensitiveDataMock, + }); + + dataFilter.processEvent(event); + + Object.keys(additionalSensitiveDataMock).forEach((key) => { + expect(event.context[key]).toBe('[filtered]'); + }); + }); + + test('should filter additional sensitive keys in addons', async () => { + const event = generateEvent({ + addons: { + vue: { + props: additionalSensitiveDataMock, + }, + }, + }); + + dataFilter.processEvent(event); + + Object.keys(additionalSensitiveDataMock).forEach((key) => { + expect(event.addons['vue']['props'][key]).toBe('[filtered]'); + }); + }); + test('should not replace values with keynames not in a list', async () => { const normalValue = 'test123'; const event = generateEvent({ @@ -143,5 +208,158 @@ describe('GrouperWorker', () => { expect(event.context['normalKey']).toBe(normalValue); expect(event.addons['vue']['props']['normalKey']).toBe(normalValue); }); + + test('should not filter UUID values that contain exactly 16 digits', async () => { + // These UUIDs contain exactly 16 digits, which when cleaned match PAN patterns + // Without UUID detection, they would be incorrectly filtered as credit cards + const uuidWithManyDigits = '4a1b2c3d-4e5f-6a7b-8c9d-0e1f2a3b4c5d'; // Cleans to 16 digits starting with 4 + const uuidUpperCase = '5A1B2C3D-4E5F-6A7B-8C9D-0E1F2A3B4C5D'; // Cleans to 16 digits starting with 5 + const uuidNoDashes = '2a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d'; // 32 hex chars without dashes + + const event = generateEvent({ + context: { + userId: uuidWithManyDigits, + requestId: uuidUpperCase, + transactionId: uuidNoDashes, + }, + addons: { + vue: { + props: { + componentId: uuidWithManyDigits, + }, + }, + }, + }); + + dataFilter.processEvent(event); + + expect(event.context['userId']).toBe(uuidWithManyDigits); + expect(event.context['requestId']).toBe(uuidUpperCase); + expect(event.context['transactionId']).toBe(uuidNoDashes); + expect(event.addons['vue']['props']['componentId']).toBe(uuidWithManyDigits); + }); + + test('should not filter MongoDB ObjectId values that contain exactly 16 digits', async () => { + // These ObjectIds contain exactly 16 digits which when cleaned match PAN patterns + // Without ObjectId detection, they would be incorrectly filtered as credit cards + const objectIdWithManyDigits = '4111111111111111abcdefab'; // 16 digits + 8 hex letters = 24 chars, cleans to Visa pattern + const objectIdUpperCase = '5111111111111111ABCDEFAB'; // Cleans to Mastercard pattern + const objectIdMixedCase = '2111111111111111AbCdEfAb'; // Cleans to Maestro/Mastercard pattern + + const event = generateEvent({ + context: { + projectId: objectIdWithManyDigits, + workspaceId: objectIdUpperCase, + transactionId: objectIdMixedCase, + }, + addons: { + hawk: { + projectId: objectIdWithManyDigits, + }, + }, + }); + + dataFilter.processEvent(event); + + expect(event.context['projectId']).toBe(objectIdWithManyDigits); + expect(event.context['workspaceId']).toBe(objectIdUpperCase); + expect(event.context['transactionId']).toBe(objectIdMixedCase); + expect(event.addons['hawk']['projectId']).toBe(objectIdWithManyDigits); + }); + + test('should still filter actual PAN numbers with formatting characters', async () => { + // Test real Mastercard test number with spaces and dashes + const panWithSpaces = '5500 0000 0000 0004'; + const panWithDashes = '5500-0000-0000-0004'; + + const event = generateEvent({ + context: { + cardNumber: panWithSpaces, + paymentCard: panWithDashes, + }, + }); + + dataFilter.processEvent(event); + + expect(event.context['cardNumber']).toBe('[filtered]'); + expect(event.context['paymentCard']).toBe('[filtered]'); + }); + + test('should not filter values that are not UUIDs, ObjectIds, or PANs', async () => { + // These are edge cases that should NOT be filtered + const shortHex = '507f1f77bcf86cd7'; // 16 hex chars (not 24) + const longNumber = '67280841958304100309082499'; // 26 digits (too long for PAN) + const mixedAlphaNum = 'abc123def456ghi789'; // Mixed content + + const event = generateEvent({ + context: { + shortId: shortHex, + longId: longNumber, + mixedId: mixedAlphaNum, + }, + }); + + dataFilter.processEvent(event); + + expect(event.context['shortId']).toBe(shortHex); + expect(event.context['longId']).toBe(longNumber); + expect(event.context['mixedId']).toBe(mixedAlphaNum); + }); + + test('should filter UUIDs and ObjectIds when they are in sensitive key fields', async () => { + // Even if the value is a valid UUID or ObjectId, it should be filtered + // if the key name is in the sensitive keys list + const uuid = '550e8400-e29b-41d4-a716-446655440000'; + const objectId = '507f1f77bcf86cd799439011'; + + const event = generateEvent({ + context: { + password: uuid, + secret: objectId, + auth: '672808419583041003090824', + }, + }); + + dataFilter.processEvent(event); + + // All should be filtered because of sensitive key names + expect(event.context['password']).toBe('[filtered]'); + expect(event.context['secret']).toBe('[filtered]'); + expect(event.context['auth']).toBe('[filtered]'); + }); + + test('should handle deeply nested objects (>20 levels) without excessive memory allocations', () => { + // Create an object nested deeper than the cap (>20 levels) + let deeplyNested: any = { value: 'leaf', secret: 'should-be-filtered' }; + + for (let i = 0; i < 25; i++) { + deeplyNested = { [`level${i}`]: deeplyNested, password: `sensitive${i}` }; + } + + const event = generateEvent({ + context: deeplyNested, + }); + + // This should not throw or cause memory issues + dataFilter.processEvent(event); + + // Verify that filtering still works at various depths + expect(event.context['password']).toBe('[filtered]'); + + // Navigate to a mid-level and check filtering + let current = event.context['level24'] as any; + for (let i = 24; i > 15; i--) { + expect(current['password']).toBe('[filtered]'); + current = current[`level${i - 1}`]; + } + + // At the leaf level, the secret should still be filtered + // (though path tracking may be capped, filtering should still work) + let leaf = event.context; + for (let i = 24; i >= 0; i--) { + leaf = leaf[`level${i}`] as any; + } + expect(leaf['secret']).toBe('[filtered]'); + }); }); });