Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion docs/ref/checks/pii.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,45 @@ Detects personally identifiable information (PII) such as SSNs, phone numbers, c

### Parameters

- **`entities`** (required): List of PII entity types to detect. See the `PIIEntity` enum in `src/checks/pii.ts` for the full list, including custom entities such as `CVV` (credit card security codes) and `BIC_SWIFT` (bank identification codes).
- **`entities`** (optional): List of PII entity types to detect. Defaults to all entities except `NRP` and `PERSON` (see note below). See the `PIIEntity` enum in `src/checks/pii.ts` for the full list, including custom entities such as `CVV` (credit card security codes) and `BIC_SWIFT` (bank identification codes).
- **`block`** (optional): Whether to block content or just mask PII (default: `false`)
- **`detect_encoded_pii`** (optional): If `true`, detects PII in Base64/URL-encoded/hex strings (default: `false`)

### Important: NRP and PERSON Entity Deprecation

**As of v0.1.8**, the `NRP` and `PERSON` entities have been **removed from the default entity list** due to their high false positive rates. These patterns are overly broad and cause issues in production:

- **`NRP`** matches any two consecutive words (e.g., "nuevo cliente", "crea un", "the user")
- **`PERSON`** matches any two capitalized words (e.g., "New York", "The User", "European Union")

**Impact:**
- ❌ Causes false positives in natural language conversation
- ❌ Particularly problematic for non-English languages (Spanish, French, etc.)
- ❌ Breaks normal text in pre-flight masking mode

**Migration Path:**

If you need to detect person names or national registration numbers, consider these alternatives:

1. **For National Registration Numbers**: Use region-specific patterns instead:
   - `SG_NRIC_FIN` (Singapore)
   - `UK_NINO` (UK National Insurance Number)
   - `FI_PERSONAL_IDENTITY_CODE` (Finland)
   - `KR_RRN` (Korea Resident Registration Number)

2. **For Person Names**: Consider using a dedicated NER (Named Entity Recognition) service or LLM-based detection for more accurate results.

3. **If you still need these patterns**: You can explicitly include them in your configuration, but be aware of the false positives:
   ```json
   {
     "entities": ["NRP", "PERSON", "EMAIL_ADDRESS"],
     "block": false
   }
   ```
   A deprecation warning is logged the first time each of these entities is used; it is not repeated on subsequent checks.

**Reference:** [Issue #47](https://github.com/openai/openai-guardrails-js/issues/47)

## Implementation Notes

Under the hood the TypeScript guardrail normalizes text (Unicode NFKC), strips zero-width characters, and runs curated regex patterns for each configured entity. When `detect_encoded_pii` is enabled the check also decodes Base64, URL-encoded, and hexadecimal substrings before rescanning them for matches, remapping any findings back to the original encoded content.
Expand Down
136 changes: 134 additions & 2 deletions src/__tests__/unit/checks/pii.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
* Unit tests for the PII guardrail functionality.
*/

import { describe, it, expect } from 'vitest';
import { pii, PIIConfig, PIIEntity } from '../../../checks/pii';
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import { pii, PIIConfig, PIIEntity, _clearDeprecationWarnings } from '../../../checks/pii';

describe('pii guardrail', () => {
it('masks detected PII when block=false', async () => {
Expand Down Expand Up @@ -286,4 +286,136 @@ describe('pii guardrail', () => {
);
expect(result.info?.checked_text).toBe('Ship to <LOCATION> for delivery.');
});

describe('NRP and PERSON deprecation (Issue #47)', () => {
  beforeEach(() => {
    // The "already warned" cache is module-level state in the pii check, so it
    // must be reset or the result of each test would depend on execution order.
    _clearDeprecationWarnings();
  });

  afterEach(() => {
    // Restore console.warn here rather than at the end of each test: a failing
    // assertion would otherwise skip the manual restore and leak the spy into
    // every test that runs afterwards.
    vi.restoreAllMocks();
  });

  it('excludes NRP and PERSON from default entities', () => {
    const config = PIIConfig.parse({});

    expect(config.entities).not.toContain(PIIEntity.NRP);
    expect(config.entities).not.toContain(PIIEntity.PERSON);
  });

  it('does not mask common two-word phrases when using defaults', async () => {
    const config = PIIConfig.parse({
      block: false,
    });
    // The address must be syntactically valid so that it is the one span the
    // default entity set is expected to mask.
    const text = 'crea un nuevo cliente con email test@example.com';

    const result = await pii({}, text, config);

    // Should only mask the email, not "crea un" or "nuevo cliente"
    expect(result.info?.checked_text).toBe('crea un nuevo cliente con email <EMAIL_ADDRESS>');
    expect((result.info?.detected_entities as Record<string, string[]>)?.NRP).toBeUndefined();
  });

  it('does not mask capitalized phrases when using defaults', async () => {
    const config = PIIConfig.parse({
      block: false,
    });
    const text = 'Welcome to New York, The User can access the system.';

    const result = await pii({}, text, config);

    // Should not mask "New York" or "The User"
    expect(result.info?.checked_text).toBe('Welcome to New York, The User can access the system.');
    expect((result.info?.detected_entities as Record<string, string[]>)?.PERSON).toBeUndefined();
  });

  it('still detects NRP when explicitly configured', async () => {
    // Silence the deprecation warning; it is asserted in its own test below.
    vi.spyOn(console, 'warn').mockImplementation(() => {});

    const config = PIIConfig.parse({
      entities: [PIIEntity.NRP],
      block: false,
    });
    const text = 'hello world';

    const result = await pii({}, text, config);

    expect((result.info?.detected_entities as Record<string, string[]>)?.NRP).toEqual(['hello world']);
    expect(result.info?.checked_text).toBe('<NRP>');
  });

  it('still detects PERSON when explicitly configured', async () => {
    // Silence the deprecation warning; it is asserted in its own test below.
    vi.spyOn(console, 'warn').mockImplementation(() => {});

    const config = PIIConfig.parse({
      entities: [PIIEntity.PERSON],
      block: false,
    });
    const text = 'John Smith lives in New York';

    const result = await pii({}, text, config);

    expect((result.info?.detected_entities as Record<string, string[]>)?.PERSON).toContain('John Smith');
    expect((result.info?.detected_entities as Record<string, string[]>)?.PERSON).toContain('New York');
  });

  it('shows deprecation warning for NRP', async () => {
    const consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});

    const config = PIIConfig.parse({
      entities: [PIIEntity.NRP],
      block: false,
    });

    await pii({}, 'test data', config);

    expect(consoleWarnSpy).toHaveBeenCalledWith(
      expect.stringContaining('DEPRECATION WARNING: PIIEntity.NRP')
    );
    expect(consoleWarnSpy).toHaveBeenCalledWith(
      expect.stringContaining('https://github.com/openai/openai-guardrails-js/issues/47')
    );
  });

  it('shows deprecation warning for PERSON', async () => {
    const consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});

    const config = PIIConfig.parse({
      entities: [PIIEntity.PERSON],
      block: false,
    });

    await pii({}, 'test data', config);

    expect(consoleWarnSpy).toHaveBeenCalledWith(
      expect.stringContaining('DEPRECATION WARNING: PIIEntity.PERSON')
    );
    expect(consoleWarnSpy).toHaveBeenCalledWith(
      expect.stringContaining('https://github.com/openai/openai-guardrails-js/issues/47')
    );
  });

  it('only shows deprecation warning once per entity', async () => {
    const consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});

    const config = PIIConfig.parse({
      entities: [PIIEntity.NRP, PIIEntity.PERSON],
      block: false,
    });

    await pii({}, 'test data', config);
    await pii({}, 'more test data', config);
    await pii({}, 'even more data', config);

    // Should only be called once for each entity (2 total)
    expect(consoleWarnSpy).toHaveBeenCalledTimes(2);
  });
});
});
65 changes: 64 additions & 1 deletion src/checks/pii.ts
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,24 @@ export enum PIIEntity {
*
* Used to control which entity types are checked and the behavior mode.
*/
/**
 * Entity types used when a configuration does not list `entities` itself.
 *
 * Every `PIIEntity` value is included except NRP and PERSON, which are left
 * out because of their high false positive rates:
 * - NRP: Matches any two consecutive words (e.g., "nuevo cliente", "crea un")
 * - PERSON: Matches any two capitalized words (e.g., "New York", "The User")
 *
 * Callers that still want either entity must name it explicitly in their
 * configuration; more specific region-based patterns such as SG_NRIC_FIN or
 * UK_NINO are the suggested alternative.
 */
const DEFAULT_PII_ENTITIES = Object.values(PIIEntity).filter((candidate) => {
  const isOverlyBroad = candidate === PIIEntity.NRP || candidate === PIIEntity.PERSON;
  return !isOverlyBroad;
});

export const PIIConfig = z.object({
entities: z.array(z.nativeEnum(PIIEntity)).default(() => Object.values(PIIEntity)),
entities: z.array(z.nativeEnum(PIIEntity)).default(() => DEFAULT_PII_ENTITIES),
block: z
.boolean()
.default(false)
Expand Down Expand Up @@ -844,6 +860,50 @@ function _asResult(
};
}

/**
 * Deprecated PII entities that have high false positive rates.
 *
 * Typed as a `ReadonlySet` so the membership list cannot be mutated by
 * accident after module load.
 */
const DEPRECATED_ENTITIES: ReadonlySet<PIIEntity> = new Set([PIIEntity.NRP, PIIEntity.PERSON]);

/**
 * Entities for which a deprecation warning has already been printed.
 *
 * Module-level on purpose: the warning is meant to appear once per entity,
 * not once per check invocation.
 */
const shownDeprecationWarnings = new Set<PIIEntity>();

/**
 * Clear deprecation warning cache. FOR TESTING ONLY.
 *
 * After this call, the next use of a deprecated entity logs its warning again.
 *
 * @internal
 */
export function _clearDeprecationWarnings(): void {
  shownDeprecationWarnings.clear();
}

/**
 * Log a one-time console warning for each deprecated entity in `entities`.
 *
 * Entities that are not in `DEPRECATED_ENTITIES` are ignored. A deprecated
 * entity is recorded in `shownDeprecationWarnings` the first time it is seen,
 * so repeated calls (or duplicates within one list) do not warn again.
 *
 * @param entities The list of entities being checked
 */
function _warnDeprecatedEntities(entities: PIIEntity[]): void {
  entities.forEach((candidate) => {
    const needsWarning =
      DEPRECATED_ENTITIES.has(candidate) && !shownDeprecationWarnings.has(candidate);
    if (!needsWarning) {
      return;
    }

    // Record before logging so the entity is only ever reported once.
    shownDeprecationWarnings.add(candidate);

    const reason =
      candidate === PIIEntity.NRP
        ? 'NRP matches any two consecutive words (e.g., "nuevo cliente", "crea un")'
        : 'PERSON matches any two capitalized words (e.g., "New York", "The User")';

    const message = [
      `[openai-guardrails-js] DEPRECATION WARNING: PIIEntity.${candidate} has been removed from default entities due to high false positive rates.`,
      ` - ${reason}`,
      ` - This pattern causes false positives in normal conversation, especially in non-English languages.`,
      ` - Consider using more specific region-based patterns like SG_NRIC_FIN, UK_NINO, etc.`,
      ` - To suppress this warning, remove PIIEntity.${candidate} from your entities configuration.`,
      ` - See: https://github.com/openai/openai-guardrails-js/issues/47`,
    ].join('\n');

    console.warn(message);
  });
}

/**
* Async guardrail check_fn for PII entity detection in text.
*
Expand All @@ -861,6 +921,9 @@ export const pii: CheckFn<Record<string, unknown>, string, PIIConfig> = async (
data,
config
): Promise<GuardrailResult> => {
// Warn about deprecated entities
_warnDeprecatedEntities(config.entities);

const result = _detectPii(data, config);
return _asResult(result, config, 'Contains PII', data);
};
Expand Down