From 65239e7e0f76c945d09458453038f37a5de42d66 Mon Sep 17 00:00:00 2001 From: srivatsan0611 Date: Tue, 25 Nov 2025 21:16:25 +0530 Subject: [PATCH 1/3] fix: Remove NRP and PERSON from default PII entities --- src/checks/pii.ts | 65 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/src/checks/pii.ts b/src/checks/pii.ts index 5527d63..5143821 100644 --- a/src/checks/pii.ts +++ b/src/checks/pii.ts @@ -145,8 +145,24 @@ export enum PIIEntity { * * Used to control which entity types are checked and the behavior mode. */ +/** + * Default PII entities to check. Treat as read-only: each parsed config receives its own copy of this list. + * + * **IMPORTANT:** NRP and PERSON are excluded from defaults due to high false positive rates. + * These patterns match overly broad text patterns: + * - NRP: Matches any two consecutive words (e.g., "nuevo cliente", "crea un") + * - PERSON: Matches any two capitalized words (e.g., "New York", "The User") + * + * If you need to detect person names or national registration numbers, explicitly + * include these entities in your configuration, or use more specific region-based + * patterns like SG_NRIC_FIN, UK_NINO, etc. + */ +const DEFAULT_PII_ENTITIES = Object.values(PIIEntity).filter( + (entity) => entity !== PIIEntity.NRP && entity !== PIIEntity.PERSON +); + export const PIIConfig = z.object({ - entities: z.array(z.nativeEnum(PIIEntity)).default(() => Object.values(PIIEntity)), + entities: z.array(z.nativeEnum(PIIEntity)).default(() => [...DEFAULT_PII_ENTITIES]), block: z .boolean() .default(false) @@ -844,6 +860,50 @@ function _asResult( }; } +/** + * Deprecated PII entities that have high false positive rates. + */ +const DEPRECATED_ENTITIES = new Set([PIIEntity.NRP, PIIEntity.PERSON]); + +/** + * Track which deprecation warnings have been shown to avoid spam. + */ +const shownDeprecationWarnings = new Set<PIIEntity>(); + +/** + * Clear deprecation warning cache. FOR TESTING ONLY. 
+ * @internal + */ +export function _clearDeprecationWarnings(): void { + shownDeprecationWarnings.clear(); +} + +/** + * Log a console warning for each deprecated entity (NRP, PERSON) found in the list, at most once per entity until the warning cache is cleared. + * + * @param entities The configured entities to inspect; entities that are not deprecated are ignored + */ +function _warnDeprecatedEntities(entities: PIIEntity[]): void { + const deprecated = entities.filter((entity) => DEPRECATED_ENTITIES.has(entity)); + + for (const entity of deprecated) { + if (shownDeprecationWarnings.has(entity)) { + continue; + } + + shownDeprecationWarnings.add(entity); + + console.warn( + `[openai-guardrails-js] DEPRECATION WARNING: PIIEntity.${entity} has been removed from default entities due to high false positive rates.\n` + + ` - ${entity === PIIEntity.NRP ? 'NRP matches any two consecutive words (e.g., "nuevo cliente", "crea un")' : 'PERSON matches any two capitalized words (e.g., "New York", "The User")'}\n` + + ` - This pattern causes false positives in normal conversation, especially in non-English languages.\n` + + ` - Consider using more specific region-based patterns like SG_NRIC_FIN, UK_NINO, etc.\n` + + ` - To suppress this warning, remove PIIEntity.${entity} from your entities configuration.\n` + + ` - See: https://github.com/openai/openai-guardrails-js/issues/47` + ); + } +} + /** * Async guardrail check_fn for PII entity detection in text. 
* @@ -861,6 +921,9 @@ export const pii: CheckFn, string, PIIConfig> = async ( data, config ): Promise => { + // Warn about deprecated entities + _warnDeprecatedEntities(config.entities); + const result = _detectPii(data, config); return _asResult(result, config, 'Contains PII', data); }; From 5a1e171c2a10a9d8fb9642c8fc83db81e71d7401 Mon Sep 17 00:00:00 2001 From: srivatsan0611 Date: Tue, 25 Nov 2025 21:16:51 +0530 Subject: [PATCH 2/3] test: Add tests for NRP/PERSON deprecation --- src/__tests__/unit/checks/pii.test.ts | 136 +++++++++++++++++++++++++- 1 file changed, 134 insertions(+), 2 deletions(-) diff --git a/src/__tests__/unit/checks/pii.test.ts b/src/__tests__/unit/checks/pii.test.ts index 24ec246..0a32ad6 100644 --- a/src/__tests__/unit/checks/pii.test.ts +++ b/src/__tests__/unit/checks/pii.test.ts @@ -2,8 +2,8 @@ * Unit tests for the PII guardrail functionality. */ -import { describe, it, expect } from 'vitest'; -import { pii, PIIConfig, PIIEntity } from '../../../checks/pii'; +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { pii, PIIConfig, PIIEntity, _clearDeprecationWarnings } from '../../../checks/pii'; describe('pii guardrail', () => { it('masks detected PII when block=false', async () => { @@ -286,4 +286,136 @@ describe('pii guardrail', () => { ); expect(result.info?.checked_text).toBe('Ship to for delivery.'); }); + + describe('NRP and PERSON deprecation (Issue #47)', () => { + beforeEach(() => { + // Clear deprecation warnings before each test to ensure clean state + _clearDeprecationWarnings(); + }); + + it('excludes NRP and PERSON from default entities', () => { + const config = PIIConfig.parse({}); + + expect(config.entities).not.toContain(PIIEntity.NRP); + expect(config.entities).not.toContain(PIIEntity.PERSON); + }); + + it('does not mask common two-word phrases when using defaults', async () => { + const config = PIIConfig.parse({ + block: false, + }); + const text = 'crea un nuevo cliente con email 
test@gmail.com'; + + const result = await pii({}, text, config); + + // Should only mask the email, not "crea un" or "nuevo cliente" + expect(result.info?.checked_text).toBe('crea un nuevo cliente con email '); + expect((result.info?.detected_entities as Record)?.NRP).toBeUndefined(); + }); + + it('does not mask capitalized phrases when using defaults', async () => { + const config = PIIConfig.parse({ + block: false, + }); + const text = 'Welcome to New York, The User can access the system.'; + + const result = await pii({}, text, config); + + // Should not mask "New York" or "The User" + expect(result.info?.checked_text).toBe('Welcome to New York, The User can access the system.'); + expect((result.info?.detected_entities as Record)?.PERSON).toBeUndefined(); + }); + + it('still detects NRP when explicitly configured', async () => { + const consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + + const config = PIIConfig.parse({ + entities: [PIIEntity.NRP], + block: false, + }); + const text = 'hello world'; + + const result = await pii({}, text, config); + + expect((result.info?.detected_entities as Record)?.NRP).toEqual(['hello world']); + expect(result.info?.checked_text).toBe(''); + + consoleWarnSpy.mockRestore(); + }); + + it('still detects PERSON when explicitly configured', async () => { + const consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + + const config = PIIConfig.parse({ + entities: [PIIEntity.PERSON], + block: false, + }); + const text = 'John Smith lives in New York'; + + const result = await pii({}, text, config); + + expect((result.info?.detected_entities as Record)?.PERSON).toContain('John Smith'); + expect((result.info?.detected_entities as Record)?.PERSON).toContain('New York'); + + consoleWarnSpy.mockRestore(); + }); + + it('shows deprecation warning for NRP', async () => { + const consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + + const config = PIIConfig.parse({ + 
entities: [PIIEntity.NRP], + block: false, + }); + + await pii({}, 'test data', config); + + expect(consoleWarnSpy).toHaveBeenCalledWith( + expect.stringContaining('DEPRECATION WARNING: PIIEntity.NRP') + ); + expect(consoleWarnSpy).toHaveBeenCalledWith( + expect.stringContaining('https://github.com/openai/openai-guardrails-js/issues/47') + ); + + consoleWarnSpy.mockRestore(); + }); + + it('shows deprecation warning for PERSON', async () => { + const consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + + const config = PIIConfig.parse({ + entities: [PIIEntity.PERSON], + block: false, + }); + + await pii({}, 'test data', config); + + expect(consoleWarnSpy).toHaveBeenCalledWith( + expect.stringContaining('DEPRECATION WARNING: PIIEntity.PERSON') + ); + expect(consoleWarnSpy).toHaveBeenCalledWith( + expect.stringContaining('https://github.com/openai/openai-guardrails-js/issues/47') + ); + + consoleWarnSpy.mockRestore(); + }); + + it('only shows deprecation warning once per entity', async () => { + const consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + + const config = PIIConfig.parse({ + entities: [PIIEntity.NRP, PIIEntity.PERSON], + block: false, + }); + + await pii({}, 'test data', config); + await pii({}, 'more test data', config); + await pii({}, 'even more data', config); + + // Should only be called once for each entity (2 total) + expect(consoleWarnSpy).toHaveBeenCalledTimes(2); + + consoleWarnSpy.mockRestore(); + }); + }); }); From 7ad9d85298a3619b03f3a84a092266e65ffbf8d7 Mon Sep 17 00:00:00 2001 From: srivatsan0611 Date: Tue, 25 Nov 2025 21:17:14 +0530 Subject: [PATCH 3/3] docs: Document NRP/PERSON deprecation --- docs/ref/checks/pii.md | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/docs/ref/checks/pii.md b/docs/ref/checks/pii.md index 36b8a93..004ab20 100644 --- a/docs/ref/checks/pii.md +++ b/docs/ref/checks/pii.md @@ -24,10 +24,45 @@ Detects personally 
identifiable information (PII) such as SSNs, phone numbers, c ### Parameters -- **`entities`** (required): List of PII entity types to detect. See the `PIIEntity` enum in `src/checks/pii.ts` for the full list, including custom entities such as `CVV` (credit card security codes) and `BIC_SWIFT` (bank identification codes). +- **`entities`** (optional): List of PII entity types to detect. Defaults to all entities except `NRP` and `PERSON` (see note below). See the `PIIEntity` enum in `src/checks/pii.ts` for the full list, including custom entities such as `CVV` (credit card security codes) and `BIC_SWIFT` (bank identification codes). - **`block`** (optional): Whether to block content or just mask PII (default: `false`) - **`detect_encoded_pii`** (optional): If `true`, detects PII in Base64/URL-encoded/hex strings (default: `false`) +### Important: NRP and PERSON Entity Deprecation + +**As of v0.1.8**, the `NRP` and `PERSON` entities have been **removed from the default entity list** due to their high false positive rates. These patterns are overly broad and cause issues in production: + +- **`NRP`** matches any two consecutive words (e.g., "nuevo cliente", "crea un", "the user") +- **`PERSON`** matches any two capitalized words (e.g., "New York", "The User", "European Union") + +**Impact:** +- ❌ Causes false positives in natural language conversation +- ❌ Particularly problematic for non-English languages (Spanish, French, etc.) +- ❌ Breaks normal text in pre-flight masking mode + +**Migration Path:** + +If you need to detect person names or national registration numbers, consider these alternatives: + +1. **For National Registration Numbers**: Use region-specific patterns instead: + - `SG_NRIC_FIN` (Singapore) + - `UK_NINO` (UK National Insurance Number) + - `FI_PERSONAL_IDENTITY_CODE` (Finland) + - `KR_RRN` (Korea Resident Registration Number) + +2. 
**For Person Names**: Consider using a dedicated NER (Named Entity Recognition) service or LLM-based detection for more accurate results. + +3. **If you still need these patterns**: You can explicitly include them in your configuration, but be aware of the false positives: + ```json + { + "entities": ["NRP", "PERSON", "EMAIL_ADDRESS"], + "block": false + } + ``` + A deprecation warning is logged once per entity, the first time the check runs with that entity configured. + +**Reference:** [Issue #47](https://github.com/openai/openai-guardrails-js/issues/47) + ## Implementation Notes Under the hood the TypeScript guardrail normalizes text (Unicode NFKC), strips zero-width characters, and runs curated regex patterns for each configured entity. When `detect_encoded_pii` is enabled the check also decodes Base64, URL-encoded, and hexadecimal substrings before rescanning them for matches, remapping any findings back to the original encoded content.