Skip to content

Commit 4e42e6e

Browse files
Signature extraction: extract original message body without forwarded or replied content (#2277)
* Implement signature extraction feature * fixing issues * fixing issues * add email signature worker setup * add support for email signature in `email fetcher`, `task manager`, 'email extractor` * clean unused old code, refactor, fix linting * update email signature worker * update email signature pipeline * cache signature data with mining_id -> user_id:email * add extract track mining, extract phone number, update contact * clear redis data, except signature stream * add new column phone_numbers to table persons * fix formatting and linting * remove unused package & install libphonenumber-js * apply deepsource-recommendation & fix formatting * fix unittest & clean code * update env variables * add signature worker to docker compose * remove unused packages * rename phone_numbers to telephone * add contact telephone to frontend * fix phone number filter * add: ui chip, phone link, copy button * fix: use newContact instead of contact * use shared i18n * add: get original message without forwarded or email reply * fix: add u flag for regex * add: extract signature using llm * remove console logs & fix linting * update llm prompt & add new models * fix: use fallback when extracting signature * add: skip disposable emails when extracting signature * add: notify signature worker when fetching is canceled * handle forwarded messages, and email replies * fix: formatting & linting * fix: deepsource (useless template literal found ) * add: minor llm prompt update * fix: better handle fetching cancel/stop * add: push to notification table after extracting signature --------- Co-authored-by: Mohamed Aziz Hammami <mohamedazizhammami01@gmail.com>
1 parent 935bada commit 4e42e6e

File tree

14 files changed

+629
-31
lines changed

14 files changed

+629
-31
lines changed

.env.master.dev

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ REDIS_SIGNATURE_STREAM_NAME = 'email-signature-stream'
6161
REDIS_SIGNATURE_STREAM_CONSUMER_GROUP = 'email-signature-consumer-group'
6262

6363

64+
SIGNATURE_USE_LLM = false
65+
SIGNATURE_OPENROUTER_API_KEY =
66+
67+
6468
## OAUTH PROVIDERS ##
6569
GOOGLE_CLIENT_ID = 21825381029-993l33883t26n48fv11mmm049j6qn6lh.apps.googleusercontent.com # ( REQUIRED ) Google client ID
6670
GOOGLE_SECRET = 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 # ( REQUIRED ) Google secret

.env.master.prod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ REDIS_EMAIL_SIGNATURE_CONSUMER_BATCH_SIZE = 500
5757
REDIS_SIGNATURE_STREAM_NAME =
5858
REDIS_SIGNATURE_STREAM_CONSUMER_GROUP =
5959

60+
SIGNATURE_USE_LLM = false
61+
SIGNATURE_OPENROUTER_API_KEY =
62+
6063

6164
## OAUTH PROVIDERS ##
6265
GOOGLE_CLIENT_ID = # ( REQUIRED ) Google client ID

backend/src/config/schema.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ const schema = z.object({
3636
REDIS_SIGNATURE_STREAM_NAME: z.string().min(1),
3737
REDIS_SIGNATURE_STREAM_CONSUMER_GROUP: z.string().min(1),
3838

39+
SIGNATURE_USE_LLM: boolean(),
40+
SIGNATURE_OPENROUTER_API_KEY: z.string().min(1).optional(),
41+
3942
/* SUPABASE + POSTGRES */
4043
SUPABASE_PROJECT_URL: z.string().url(),
4144
SUPABASE_SECRET_PROJECT_TOKEN: z.string().min(1),

backend/src/emailSignatureWorker.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ import EmailSignatureConsumer, {
1313
} from './workers/email-signature/consumer';
1414
import RedisEmailSignatureCache from './services/cache/redis/RedisEmailSignatureCache';
1515
import supabaseClient from './utils/supabase';
16+
import { Signature } from './services/signature';
17+
import { LLMModels } from './services/signature/signature-llm';
18+
import { checkDomainStatus } from './utils/helpers/domainHelpers';
1619

1720
const redisClient = redis.getClient();
1821
const subscriberRedisClient = redis.getSubscriberClient();
@@ -21,7 +24,14 @@ const emailSignatureCache = new RedisEmailSignatureCache(redisClient);
2124

2225
const { processStreamData } = initializeEmailSignatureProcessor(
2326
supabaseClient,
24-
emailSignatureCache
27+
new Signature(logger, {
28+
model: LLMModels.DeepSeek8bFree,
29+
apiKey: ENV.SIGNATURE_OPENROUTER_API_KEY,
30+
useLLM: ENV.SIGNATURE_USE_LLM
31+
}),
32+
emailSignatureCache,
33+
checkDomainStatus,
34+
redisClient
2535
);
2636

2737
const tasksManagementSubscriber = new RedisSubscriber<PubSubMessage>(

backend/src/services/imap/ImapEmailsFetcher.ts

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -363,9 +363,30 @@ export default class ImapEmailsFetcher {
363363
/**
364364
* Performs cleanup operations after the fetching process has finished or stopped.
365365
*/
366-
async stop() {
367-
this.isCanceled = true;
368-
await this.process;
366+
async stop(cancel: boolean) {
367+
if (cancel) {
368+
this.isCanceled = true;
369+
await this.process;
370+
await publishStreamsPipeline([
371+
{
372+
stream: this.signatureStream,
373+
data: {
374+
type: 'email',
375+
data: {
376+
header: {},
377+
body: '',
378+
seqNumber: -1,
379+
folderPath: '',
380+
isLast: true
381+
},
382+
userId: this.userId,
383+
userEmail: this.userEmail,
384+
userIdentifier: this.userIdentifier,
385+
miningId: this.miningId
386+
}
387+
}
388+
]);
389+
}
369390
await redisClient.unlink(this.processSetKey);
370391
await this.imapConnectionProvider.cleanPool(); // Do it async because it may take up to 30s to close
371392
return this.isCompleted;
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import { Logger } from 'winston';
2+
import { ExtractSignature, PersonLD } from './types';
3+
import { LLMModelType, SignatureLLM } from './signature-llm';
4+
import { SignatureRE } from './signature-regex';
5+
6+
/**
 * Options that decide which signature extractor {@link Signature} uses.
 */
export interface Config {
  /** Enables the LLM extractor; only honoured when `apiKey` and `model` are set too. */
  useLLM: boolean;
  /** API key passed to the LLM extractor. */
  apiKey?: string;
  /** Model identifier passed to the LLM extractor. */
  model?: LLMModelType;
}

/**
 * Facade over the available signature extractors.
 *
 * The LLM-based extractor is used when `useLLM`, `apiKey` and `model` are all
 * provided; otherwise the regex-based extractor is used.
 */
export class Signature implements ExtractSignature {
  private readonly extractor: ExtractSignature;

  constructor(
    private readonly logger: Logger,
    { apiKey, model, useLLM }: Config
  ) {
    if (apiKey && model && useLLM) {
      this.logger.info(`Using LLM-based signature extractor (${model})`);
      this.extractor = new SignatureLLM(this.logger, model, apiKey);
    } else {
      this.logger.info('Using regex-based signature extractor');
      this.extractor = new SignatureRE(this.logger);
    }
  }

  /**
   * Extracts person details from an email signature.
   *
   * When the LLM extractor is active and it either throws or resolves to
   * `null`, the regex extractor is used as a fallback.
   *
   * @param signature - Raw signature text.
   * @returns The extracted person, or `null` when extraction failed.
   */
  async extract(signature: string): Promise<PersonLD | null> {
    const usingLLM = this.extractor instanceof SignatureLLM;

    try {
      const person = await this.extractor.extract(signature);
      if (person !== null || !usingLLM) return person;

      // The LLM extractor reports request/parse failures by resolving to
      // `null` instead of throwing, so a `null` result must trigger the
      // fallback as well; otherwise the fallback would never run.
      this.logger.warn(
        'signature extractor LLM returned no result. Using fallback.'
      );
    } catch (err) {
      if (!usingLLM) {
        this.logger.error(
          `${this.extractor.constructor.name} extractor failed`,
          err
        );
        return null;
      }
      this.logger.warn(
        'signature extractor LLM failed. Using fallback.',
        err
      );
    }

    return this.extractWithFallback(signature);
  }

  /** Runs the regex extractor; a failure is logged and reported as `null`. */
  private async extractWithFallback(
    signature: string
  ): Promise<PersonLD | null> {
    try {
      return await new SignatureRE(this.logger).extract(signature);
    } catch (err) {
      this.logger.error('SignatureRE extractor failed', err);
      return null;
    }
  }
}
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
import { Logger } from 'winston';
2+
import { ExtractSignature, PersonLD } from './types';
3+
4+
/**
 * Model identifiers that can be selected for LLM-based signature extraction.
 * Each value is the string sent as the `model` field of the request.
 */
export enum LLMModels {
  DeepSeek8bFree = 'deepseek/deepseek-r1-0528-qwen3-8b:free',
  Mistral7bFree = 'mistralai/mistral-7b-instruct:free',
  metaLlama8bInstructFree = 'meta-llama/llama-3.3-8b-instruct:free',
  googleGemma4bFree = 'google/gemma-3n-e4b-it:free',
  qwen7bInstructFree = 'qwen/qwen-2.5-7b-instruct:free'
}

/** Union of the raw model-id strings held by {@link LLMModels}. */
export type LLMModelType = `${LLMModels}`;
13+
14+
/**
 * Prompt material for LLM-based signature extraction.
 *
 * - `system`: the system message describing the task and the expected
 *   schema.org "Person" JSON output.
 * - `response_format`: request option asking for a JSON object reply.
 * - `buildUserPrompt`: builds the user message; the signature is passed
 *   through unchanged.
 *
 * The worked example inside `system` follows the field types the prompt
 * itself declares: `telephone` and `sameAs` are arrays, and social links are
 * full URLs.
 */
export const SignaturePrompt = {
  system: `<system_prompt>
YOU ARE A DATA EXTRACTION AGENT THAT PARSES EMAIL SIGNATURES AND OUTPUTS CLEAN JSON IN THE schema.org "Person" FORMAT

###TASK###

EXTRACT ONLY WHAT IS EXPLICITLY PRESENT IN THE TEXT. FORMAT AND RETURN AS VALID JSON. DO NOT ADD, GUESS, OR ALTER DATA.

###RULES###

- **ONLY USE WHAT IS CLEARLY WRITTEN** — NO INFERENCE
- **PRESERVE ORIGINAL SPELLING AND CAPITALIZATION**
- **REMOVE SPACES FROM PHONE NUMBERS** (E.G., '+1234567890')
- **CONVERT SOCIAL HANDLES TO FULL URLS IN 'sameAs'**
- **OMIT FIELDS NOT FULLY PRESENT**
- **OUTPUT VALID JSON ONLY** — NO MARKDOWN, NO EXPLANATION, NO TEXT

###REQUIRED FIELDS###

- '@type' (MUST be '"Person"')
- 'name' (MUST be present)

OPTIONAL (include only if clearly found):
- 'image': string (URL to an image of the person (avatar or profile picture))
- 'jobTitle': string (The persons job title)
- 'worksFor': string (Company or organization the person works for)
- 'address': string (The persons physical address)
- 'telephone': string[] (Array of the persons phone numbers (e.g., mobile, WhatsApp))
- 'sameAs': string[] (Array of the persons full correct profile URLs)

###CHAIN OF THOUGHT###

1. **READ** the signature line by line
2. **IDENTIFY** explicit values only (no assumptions)
3. **PARSE** email, phone, socials, name, etc.
4. **FORMAT** phone and links as required
5. **BUILD** strict schema.org Person JSON
6. **IGNORE** incomplete, unclear, or ambiguous data
7. **OUTPUT** JSON only — no preamble or notes

###WHAT NOT TO DO###

- DO NOT GUESS OR FIX SPELLING/CAPITALIZATION
- DO NOT OUTPUT FIELDS NOT FULLY FOUND
- DO NOT ADD FAKE OR DEFAULT DATA
- DO NOT USE PARTIAL SOCIAL LINKS
- DO NOT OUTPUT ANYTHING BUT JSON

###EXAMPLE###

**Input:**
Jane Smith
Marketing Lead
Bright Horizons Ltd.
jane@brighthorizons.co.uk
+44 7911 123456
1 Sunrise Way, London
Twitter: twitter.com/janesmith

**Output:**
{
  "@type": "Person",
  "name": "Jane Smith",
  "jobTitle": "Marketing Lead",
  "worksFor": "Bright Horizons Ltd.",
  "email": "jane@brighthorizons.co.uk",
  "telephone": ["+447911123456"],
  "address": "1 Sunrise Way, London",
  "sameAs": ["https://twitter.com/janesmith"]
}

</system_prompt>
`,
  response_format: {
    type: 'json_object'
  },

  buildUserPrompt: (signature: string): string => signature
};
93+
94+
/**
 * Signature extractor that asks an LLM, through the OpenRouter
 * chat-completions endpoint, to turn a signature into a schema.org Person.
 *
 * Failures (network, API error, invalid JSON, non-Person reply) are logged
 * and reported by resolving to `null`; `extract` does not reject.
 */
export class SignatureLLM implements ExtractSignature {
  LLM_ENDPOINT = 'https://openrouter.ai/api/v1/chat/completions';

  constructor(
    private readonly logger: Logger,
    private readonly model: LLMModelType,
    private readonly apiKey: string
  ) {}

  /** Request headers carrying the bearer token. */
  private headers() {
    return {
      Authorization: `Bearer ${this.apiKey}`,
      'Content-Type': 'application/json'
    };
  }

  /** Serialised chat-completions request for the given signature. */
  private body(signature: string) {
    return JSON.stringify({
      model: this.model,
      messages: [
        { role: 'system', content: SignaturePrompt.system },
        {
          role: 'user',
          content: SignaturePrompt.buildUserPrompt(signature)
        }
      ],
      response_format: SignaturePrompt.response_format
    });
  }

  /**
   * Normalises a model-provided value into a list of non-empty strings.
   * Accepts a single string or an array; anything else is dropped.
   */
  private static toStringArray(value: unknown): string[] {
    const items: unknown[] = Array.isArray(value) ? value : [value];
    return items.filter(
      (item): item is string => typeof item === 'string' && item.trim() !== ''
    );
  }

  /** Returns the value when it is a non-empty string, otherwise `undefined`. */
  private static toOptionalString(value: unknown): string | undefined {
    return typeof value === 'string' && value.trim() !== '' ? value : undefined;
  }

  /**
   * Sends the signature to the LLM.
   *
   * @returns The text content of the first choice, or `null` on any failure.
   */
  private async sendPrompt(signature: string): Promise<string | null> {
    try {
      const response = await fetch(this.LLM_ENDPOINT, {
        method: 'POST',
        headers: this.headers(),
        body: this.body(signature)
      });
      const data = (await response.json()) as {
        error?: { message?: string };
        choices?: { message?: { content?: unknown } }[];
      } | null;

      const error = data?.error?.message;
      if (error) throw new Error(error);
      // An error status without an error payload must not pass as success.
      if (!response.ok) {
        throw new Error(`LLM request failed with status ${response.status}`);
      }

      const content = data?.choices?.[0]?.message?.content;
      return typeof content === 'string' ? content : null;
    } catch (err) {
      this.logger.error('SignatureExtractionLLM error:', err);
      return null;
    }
  }

  /**
   * Extracts person details from a signature using the LLM.
   *
   * The reply is model-generated and therefore untrusted: it is validated
   * and normalised so the result always matches `PersonLD` (list fields are
   * arrays of strings, scalar fields are strings or `undefined`).
   *
   * @param signature - Raw signature text.
   * @returns The extracted person, or `null` when the request failed or the
   *   reply is not a JSON object with `"@type": "Person"`.
   */
  public async extract(signature: string): Promise<PersonLD | null> {
    try {
      const content = await this.sendPrompt(signature);

      if (!content) return null;

      const parsed: unknown = JSON.parse(content);

      if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
        return null;
      }

      const person = parsed as Record<string, unknown>;

      if (person['@type'] !== 'Person') return null;

      const name = person['name'];
      const sameAs = person['sameAs'];

      return {
        // Keep the other extracted fields even if the model omitted the name.
        name: typeof name === 'string' ? name : '',
        image: SignatureLLM.toOptionalString(person['image']),
        jobTitle: SignatureLLM.toOptionalString(person['jobTitle']),
        worksFor: SignatureLLM.toOptionalString(person['worksFor']),
        address: SignatureLLM.toStringArray(person['address']),
        telephone: SignatureLLM.toStringArray(person['telephone']),
        sameAs: sameAs == null ? undefined : SignatureLLM.toStringArray(sameAs)
      };
    } catch (err) {
      this.logger.error('SignatureExtractionLLM error:', err);
      return null;
    }
  }
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import { Logger } from 'winston';
2+
import { findPhoneNumbersInText } from 'libphonenumber-js';
3+
import { ExtractSignature, PersonLD } from './types';
4+
5+
/**
 * Matches X/Twitter profile links. Capture group 1 is the handle.
 * The `\b` in front of the host stops unrelated domains that merely end in
 * `x.com` (for example `netflix.com/...`) from matching.
 */
export const URL_X_REGEX =
  /(?:https?:\/\/)?(?:www\.)?\b(?:twitter\.com|x\.com)\/(\w{1,15})\b/g;

/**
 * Matches LinkedIn profile links, with an optional scheme and an optional
 * 2-3 letter subdomain such as `www.` or `fr.`. Capture group 1 is
 * `linkedin.com/in/<handle>` without a trailing slash.
 * The global flag is required because the pattern is used with `matchAll`.
 */
export const URL_LINKEDIN_REGEX =
  /(?:https?:\/\/)?(?:[a-z]{2,3}\.)?\b(linkedin\.com\/in\/[a-zA-Z0-9\-_%]{3,100})\/?/g;

/**
 * Regex-based signature extractor. It only finds phone numbers and
 * LinkedIn / X profile links; every other field is left empty.
 */
export class SignatureRE implements ExtractSignature {
  // NOTE(review): not read anywhere in this class; it looks copied from the
  // LLM extractor. Kept so the public shape of the class does not change.
  LLM_ENDPOINT = 'https://openrouter.ai/api/v1/chat/completions';

  constructor(private readonly logger: Logger) {}

  /** Returns the phone numbers that libphonenumber-js finds in the text. */
  private static getTelephone(signature: string): string[] {
    const telephone = findPhoneNumbersInText(signature);
    return telephone.map((phone) => phone.number.number);
  }

  /** Returns the unique, normalised LinkedIn and X profile URLs in the text. */
  private static getSameAs(signature: string): string[] {
    const matches = new Set<string>();

    for (const [, profile] of signature.matchAll(URL_LINKEDIN_REGEX)) {
      matches.add(`https://www.${profile}`);
    }

    for (const [, handle] of signature.matchAll(URL_X_REGEX)) {
      matches.add(`https://x.com/${handle}`);
    }

    return [...matches];
  }

  /**
   * Extracts phone numbers and social profile links from a signature.
   *
   * @param signature - Raw signature text.
   * @returns A person with an empty `name` plus `telephone` and `sameAs`,
   *   or `null` if extraction throws.
   */
  public async extract(signature: string): Promise<PersonLD | null> {
    try {
      return {
        name: '',
        telephone: SignatureRE.getTelephone(signature),
        sameAs: SignatureRE.getSameAs(signature),
        image: undefined,
        jobTitle: undefined,
        worksFor: undefined,
        address: undefined
      };
    } catch (err) {
      this.logger.error('SignatureRE error:', err);
      return null;
    }
  }
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
/**
 * Person details extracted from an email signature.
 * Only `name` is required; every other field is optional.
 */
export interface PersonLD {
  /** The person's name. */
  name: string;
  /** Image reference for the person. */
  image?: string;
  /** The person's job title. */
  jobTitle?: string;
  /** Organisation the person works for. */
  worksFor?: string;
  /** Addresses found for the person. */
  address?: string[];
  /** Phone numbers found for the person. */
  telephone?: string[];
  /** Profile links found for the person. */
  sameAs?: string[];
}

/**
 * Contract implemented by every signature extractor.
 */
export interface ExtractSignature {
  /**
   * Extracts person details from a signature.
   *
   * @param signature - Raw signature text.
   * @returns The extracted person, or `null` when nothing could be extracted.
   */
  extract(signature: string): Promise<PersonLD | null>;
}

backend/src/services/tasks-manager/TasksManager.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,9 @@ export default class TasksManager {
355355
task.status = canceled ? TaskStatus.Canceled : TaskStatus.Done;
356356

357357
if (task.type === 'fetch') {
358-
await (task as TaskFetch).instance.stop();
358+
await (task as TaskFetch).instance.stop(
359+
TaskStatus.Canceled === 'canceled'
360+
);
359361
}
360362

361363
await this.pubsubSendMessage(

0 commit comments

Comments
 (0)