Skip to content

Commit

Permalink
fix(dedupe): improve dedupe operation cost from O(n^2) to O(n)
Browse files Browse the repository at this point in the history
  • Loading branch information
uladkasach committed Jun 18, 2024
1 parent 7a368cd commit 87faa66
Show file tree
Hide file tree
Showing 11 changed files with 155 additions and 69 deletions.
83 changes: 77 additions & 6 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
"postinstall": "[ -d .git ] && npx husky install || exit 0"
},
"dependencies": {
"@ehmpathy/error-fns": "^1.1.0",
"@types/joi": "^17.2.3",
"@types/lodash.omit": "^4.5.6",
"@types/yup": "^0.29.6",
Expand Down
3 changes: 1 addition & 2 deletions src/manipulation/getPrimaryIdentifier.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import { UnexpectedCodePathError } from '@ehmpathy/error-fns';
import pick from 'lodash.pick';

import { assertDomainObjectIsSafeToManipulate } from '../constraints/assertDomainObjectIsSafeToManipulate';
import { DomainEntity } from '../instantiation/DomainEntity';
import { DomainEvent } from '../instantiation/DomainEvent';
import { DomainLiteral } from '../instantiation/DomainLiteral';
import { DomainObject } from '../instantiation/DomainObject';
import { Ref } from '../reference/DomainReference';
import { UnexpectedCodePathError } from '../utils/errors/UnexpectedCodePathError';
import { DomainEntityPrimaryKeysMustBeDefinedError } from './DomainEntityPrimaryKeysMustBeDefinedError';

/**
Expand Down
2 changes: 1 addition & 1 deletion src/manipulation/getUniqueIdentifier.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import { UnexpectedCodePathError } from '@ehmpathy/error-fns';
import pick from 'lodash.pick';

import { assertDomainObjectIsSafeToManipulate } from '../constraints/assertDomainObjectIsSafeToManipulate';
import { DomainEntity } from '../instantiation/DomainEntity';
import { DomainEvent } from '../instantiation/DomainEvent';
import { DomainLiteral } from '../instantiation/DomainLiteral';
import { DomainObject } from '../instantiation/DomainObject';
import { UnexpectedCodePathError } from '../utils/errors/UnexpectedCodePathError';
import { DomainEntityUniqueKeysMustBeDefinedError } from './DomainEntityUniqueKeysMustBeDefinedError';

/**
Expand Down
2 changes: 1 addition & 1 deletion src/manipulation/getUpdatableProperties.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import { UnexpectedCodePathError } from '@ehmpathy/error-fns';
import pick from 'lodash.pick';

import { assertDomainObjectIsSafeToManipulate } from '../constraints/assertDomainObjectIsSafeToManipulate';
import { DomainEntity } from '../instantiation/DomainEntity';
import { DomainObject } from '../instantiation/DomainObject';
import { UnexpectedCodePathError } from '../utils/errors/UnexpectedCodePathError';
import { DomainEntityUpdatablePropertiesMustBeDefinedError } from './DomainEntityUpdatablePropertiesMustBeDefinedError';

/**
Expand Down
4 changes: 2 additions & 2 deletions src/manipulation/relate/__snapshots__/dedupe.test.ts.snap
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP

exports[`dedupe should fail fast with a helpful error if there are multiple versions of the same entity, by default 1`] = `
"UnexpectedCodePath: More than one version of the same entity found in the array. Can not safely dedupe, since we don't know which version should be kept.
"BadRequestError: Two different versions of the same entity were asked to be deduped. Options.onMultipleEntityVersions !== 'CHOOSE_FIRST_OCCURRENCE', so we're failing fast here, since we don't know which version should be kept.
{"firstOccurrence":{"saltwaterSecurityNumber":"821","name":"Shelly C."},"nextOccurrence":{"saltwaterSecurityNumber":"821","name":"Shellina C."}}"
{"thisObj":{"saltwaterSecurityNumber":"821","name":"Shellina C."},"versionCurrSeen":"{\\"_dobj\\":\\"SeaTurtle\\",\\"name\\":\\"Shellina C.\\",\\"saltwaterSecurityNumber\\":\\"821\\"}","versionPrevSeen":"{\\"_dobj\\":\\"SeaTurtle\\",\\"name\\":\\"Shelly C.\\",\\"saltwaterSecurityNumber\\":\\"821\\"}"}"
`;
2 changes: 1 addition & 1 deletion src/manipulation/relate/dedupe.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ describe('dedupe', () => {
});
const error = getError(() => dedupe([turtleOne, turtleOneWithDiffName]));
expect(error.message).toContain(
'More than one version of the same entity found in the array',
'Two different versions of the same entity were asked to be deduped.',
);
expect(error.message).toMatchSnapshot();
});
Expand Down
109 changes: 69 additions & 40 deletions src/manipulation/relate/dedupe.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,25 @@
import { BadRequestError, UnexpectedCodePathError } from '@ehmpathy/error-fns';

import { DomainEntity } from '../../instantiation/DomainEntity';
import { DomainObject } from '../../instantiation/DomainObject';
import { UnexpectedCodePathError } from '../../utils/errors/UnexpectedCodePathError';
import { getUniqueIdentifier } from '../getUniqueIdentifier';
import { omitMetadataValues } from '../omitMetadataValues';
import { serialize } from '../serde/serialize';

/**
 * computes the key under which an object is deduplicated
 *
 * note
 * - for dobj instances, the key is the serialization of their unique identifier (via getUniqueIdentifier)
 * - for anything else, the key is the serialization of the value itself
 */
const toDedupeIdentity = <T>(obj: T) => {
  // domain objects are identified by their unique identifier, not by their full contents
  if (obj instanceof DomainObject) return serialize(getUniqueIdentifier(obj));

  // everything else is identified by its serialized value
  return serialize(obj);
};

/**
 * computes the version identity of an object
 *
 * note
 * - for domain entities, it is the serialization of the entity after omitMetadataValues has been applied
 * - for anything else, it is undefined
 */
const toVersionIdentity = <T>(obj: T) => {
  // if not an entity, there is no version identity
  if (!(obj instanceof DomainEntity)) return undefined;

  // entities are versioned by their serialized, metadata-omitted contents
  return serialize(omitMetadataValues(obj));
};

/**
* a method which deduplicates objects by their identity from within an array
*
* note
* - when it operates on dobj instances, it extracts their identity via getUniqueIdentifier
* - when it operates on anything else, it simply serializes the object
*/
export const dedupe = <T>(
objs: T[],
options?: {
Expand All @@ -20,54 +35,68 @@ export const dedupe = <T>(
*/
onMultipleEntityVersions: 'FAIL_FAST' | 'CHOOSE_FIRST_OCCURRENCE';
},
): T[] =>
objs.filter((thisObj, thisIndex) => {
// determine whether this is the first occurrence of this dobj in the array
const indexOfFirstOccurrence = objs.findIndex(
(otherObj) =>
serialize(
thisObj instanceof DomainObject
? getUniqueIdentifier(thisObj)
: thisObj,
) ===
serialize(
otherObj instanceof DomainObject
? getUniqueIdentifier(otherObj)
: otherObj,
),
);
const isFirstOccurrence = indexOfFirstOccurrence === thisIndex;
): T[] => {
// track the objects we have seen
const objsSeenMetadata: Record<string, { seen: true; version?: string }> = {};

// track the ordered, deduped objs array, which we will build
const objsDedupedList: T[] = [];

// define how to check whether an obj has been seen already
const getObjSeenMetadata = (
obj: T,
): { seen: true; version?: string } | null =>
objsSeenMetadata[toDedupeIdentity(obj)] ?? null;

// define how to add a new distinct item
const addNewDistinctObj = (obj: T): void => {
// add to the objs seen lookup table
objsSeenMetadata[toDedupeIdentity(obj)] = {
seen: true,
version: toVersionIdentity(obj),
};

// if this dobj is the first occurrence, then defo not a dupe
if (isFirstOccurrence) return true;
// add to the objs deduped list
objsDedupedList.push(obj);
};

// if this dobj is not the first occurrence and it is an entity, then sanity check that there are no changes between the updatable attributes
// iterate through each object and add it to the deduped list as needed
objs.forEach((thisObj) => {
// determine if it's been seen before
const prevSeenMetadata = getObjSeenMetadata(thisObj);
const hasBeenPrevSeen = prevSeenMetadata !== null;

// if it has been seen, is an entity, and the caller didn't ask to CHOOSE_FIRST_OCCURRENCE, then check whether we should fail fast
if (
hasBeenPrevSeen &&
thisObj instanceof DomainEntity &&
options?.onMultipleEntityVersions !== 'CHOOSE_FIRST_OCCURRENCE' // if they didn't explicitly ask to choose first occurrence, then check for versions
options?.onMultipleEntityVersions !== 'CHOOSE_FIRST_OCCURRENCE'
) {
const firstOccurrence = objs[indexOfFirstOccurrence];
const foundDifferentAttributes =
serialize(
firstOccurrence instanceof DomainObject
? omitMetadataValues(firstOccurrence)
: firstOccurrence,
) !==
serialize(
thisObj instanceof DomainObject
? omitMetadataValues(thisObj)
: thisObj,
);
if (foundDifferentAttributes)
const versionPrevSeen = prevSeenMetadata.version;
if (!versionPrevSeen)
throw new UnexpectedCodePathError(
`More than one version of the same entity found in the array. Can not safely dedupe, since we don't know which version should be kept.`,
'should have had prev seen metadata declared for a domain entity',
{ thisObj },
);
const versionCurrSeen = toVersionIdentity(thisObj);
if (versionCurrSeen !== versionPrevSeen)
throw new BadRequestError(
`Two different versions of the same entity were asked to be deduped. Options.onMultipleEntityVersions !== 'CHOOSE_FIRST_OCCURRENCE', so we're failing fast here, since we don't know which version should be kept.`,
{
firstOccurrence,
nextOccurrence: thisObj,
thisObj,
versionCurrSeen,
versionPrevSeen,
},
);
}

// otherwise, this is a dupe, and should be removed
return false;
// if it's been previously seen otherwise, then we can exit here as it's a dupe
if (hasBeenPrevSeen) return;

// otherwise, since it's not been previously seen, add it as a new distinct obj
addNewDistinctObj(thisObj);
});

// return all the distinct objs
return objsDedupedList;
};
2 changes: 1 addition & 1 deletion src/reference/isPrimaryKeyRef.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { UnexpectedCodePathError } from '@ehmpathy/error-fns';
import { isPresent } from 'type-fns';

import { UnexpectedCodePathError } from '../utils/errors/UnexpectedCodePathError';
import { DomainPrimaryKeyShape } from './DomainPrimaryKeyShape';
import { DomainObjectShape, Refable } from './DomainReferenceable';
import { DomainUniqueKeyShape } from './DomainUniqueKeyShape';
Expand Down
2 changes: 1 addition & 1 deletion src/reference/isUniqueKeyRef.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { UnexpectedCodePathError } from '@ehmpathy/error-fns';
import { isPresent } from 'type-fns';

import { UnexpectedCodePathError } from '../utils/errors/UnexpectedCodePathError';
import { DomainPrimaryKeyShape } from './DomainPrimaryKeyShape';
import { DomainObjectShape, Refable } from './DomainReferenceable';
import { DomainUniqueKeyShape } from './DomainUniqueKeyShape';
Expand Down
14 changes: 0 additions & 14 deletions src/utils/errors/UnexpectedCodePathError.ts

This file was deleted.

0 comments on commit 87faa66

Please sign in to comment.