-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathannotator.py
More file actions
459 lines (351 loc) · 15.9 KB
/
annotator.py
File metadata and controls
459 lines (351 loc) · 15.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
#!/usr/bin/env python3
'''LLM-based annotator module'''
import argparse
import json
import logging
import os
import textwrap
from typing import Generator
import litellm
import rapidjson
import regex
from litellm import completion
from rapidfuzz import fuzz
from lemminflect import getAllLemmas, getAllInflections, getAllInflectionsOOV # type: ignore
from type_hints import DocumentChunkInfo, DocumentJson, PhraseIdentifier, LlmStatement, Statement
# Cache LLM responses on disk so repeated runs with identical prompts do not
# re-query the API (retries explicitly bypass this cache; see Annotator._llm_extract).
litellm.enable_cache(type="disk")
# System prompt for the extraction task. It instructs the model to return a JSON
# list of statement objects (action/data/processor/recipient/purpose/context/
# prohibition) and includes five few-shot input/output examples. NOTE: this text
# is part of the program's runtime behavior -- do not edit casually.
PROMPT = '''
### Instructions
Analyze the user-provided privacy policy excerpt and extract information about personal data processing.
Return a list of JSON objects, each with the following keys:
- action: List[str] -- List of actions applied to the personal data. For example: "collect", "share", "use".
- data: List[str] -- List of personal data types that are processed. For example: "email address", "mac address", and broader terms like "personal data", "contact info".
- processor: List[str] -- List of entities that process the personal data. For example: "we" (the first party), "our third-party partners", or specific company names.
- recipient: List[str] -- List of entities that receive personal data, when the action involves data transfer. Same examples as for "processor".
- purpose: List[str] -- List of purposes for which the personal data is processed. For example: "authentication", "to provide services".
- context: List[str] -- Other conditions associated with personal data processing. For example: "if you register an account", "when you use our services".
- prohibition: bool -- Specially, if the statement denies or prohibits the stated action (for example, "we DO NOT collect..."), include this key and set it to true.
Notes:
- Ensure that the string values are extracted exactly from the text, preserving the original wording.
- The information to extract may spread across multiple sentences. Make sure to analyze the entire excerpt.
- Omit any of the keys if the corresponding information is not present in the text.
- Only include affirmative and negative statements concerning personal data processing. Ignore other types of statements.
- Return a list of JSON objects, one for each relevant statement found in the excerpt. If there are no relevant statements, simply return an empty list `[]`.
### Examples
Input 1:
> When you create an account, or when you contact us, we may collect a variety of information,
> including your name, mailing address, contact preferences, and credit card information.
Output 1:
[
{
"action": ["collect"],
"processor": ["we"],
"data": ["name", "mailing address", "contact preferences", "credit card information"],
"context": ["When you create an account", "when you contact us"]
}
]
Input 2:
> Here are the types of personal information we collect:
> * Identity Information: such as your user identification number.
> * Contact Information: such as your email address and telephone number.
> We will never share these data with third parties.
Output 2:
[
{
"action": ["collect"],
"processor": ["we"],
"data": ["Identity Information", "user identification number", "Contact Information", "email address", "telephone number"]
},
{
"action": ["share"],
"processor": ["We"],
"recipient": ["third parties"],
"data": ["Identity Information", "user identification number", "Contact Information", "email address", "telephone number"],
"prohibition": true
}
]
Input 3:
> We may share your personal information with CompanyX.
> CompanyX uses your personal information to operate, provide, and improve the products that we offer.
> These purposes include: Purchase and delivery of products.
Output 3:
[
{
"action": ["share"],
"processor": ["We"],
"recipient": ["CompanyX"],
"data": ["personal information"],
},
{
"action": ["uses"],
"processor": ["CompanyX"],
"data": ["personal information"],
"purpose": ["to operate, provide, and improve the products that we offer", "Purchase and delivery of products"]
}
]
Input 4:
> As required by law, we will never disclose sensitive personal information to third parties without your explicit consent.
> When you use third party services, including cloud services and customer service providers, they may share information about that usage with us.
Output 4:
[
{
"action": ["disclose"],
"processor": ["We"],
"recipient": ["third parties"],
"data": ["sensitive personal information"],
"context": ["As required by law", "without your explicit consent"],
"prohibition": true
},
{
"action": ["share"],
"processor": ["third party services", "cloud services", "customer service providers"],
"recipient": ["us"],
"data": ["information about that usage"],
"context": ["When you use third party services, including cloud services and customer service providers"]
}
]
Input 5:
> You have the right to access, update, and correct inaccuracies in your personal information in our custody.
> However, you may not disable certain types of data processing.
Output 5:
[]
'''
class Annotator:
    '''LLM-based annotator -- main runner.

    For each document chunk, repeatedly queries the LLM to extract
    personal-data-processing statements, using up to ``reflection_rounds``
    "please continue" passes per chunk and up to ``error_retries`` attempts
    per LLM call when the response cannot be parsed.
    '''

    def __init__(self, model_name: str):
        self.model_name = model_name
        # Maximum number of "continue" reflection passes per chunk.
        self.reflection_rounds = 3
        # Maximum retries per extraction when the LLM returns unparseable JSON.
        self.error_retries = 3

    def run(self, doc: DocumentJson) -> Generator[tuple[int, Statement], None, None]:
        '''Yield ``(chunk_index, statement)`` for every unique statement found in *doc*.

        Duplicate statements (identical after key-sorted JSON serialization)
        are suppressed across all chunks.
        '''
        seen_statements = set()
        for chunk_index, chunk in enumerate(doc["chunks"]):
            statements: list[Statement] = []
            # BUG FIX: this loop previously reused the variable ``i``, shadowing
            # the chunk index from enumerate(), so the yielded index was the
            # last reflection-round number instead of the chunk position.
            for round_idx in range(self.reflection_rounds):
                # Skip further rounds once the LLM reports nothing left to add.
                if statements and self._check_if_exhausted(chunk, statements):
                    logging.info("Exhausted. %d statements found", len(statements))
                    break
                logging.info("Try %d", round_idx + 1)
                new_statements = self._llm_extract(chunk, statements)
                logging.info("%d new statements", len(new_statements))
                if len(new_statements) == 0:
                    break
                statements.extend(new_statements)
            for st in statements:
                # Avoid duplicated statements
                st_key = json.dumps(st, sort_keys=True)
                if st_key in seen_statements:
                    continue
                seen_statements.add(st_key)
                yield chunk_index, st

    def _llm_extract(self, chunk: DocumentChunkInfo, current_statements: list[Statement]) -> list[Statement]:
        '''Ask the LLM for (more) statements in *chunk*; validate and fix each one.

        Returns an empty list if all ``error_retries`` attempts fail to produce
        parseable JSON.
        '''
        text = chunk["text"]
        for i_retry in range(self.error_retries):
            messages = [
                {"role": "system", "content": PROMPT},
                {"role": "user", "content": f"### INPUT\n\n{text}"},
            ]
            if len(current_statements) > 0:
                # Replay the statements found so far as an assistant turn and
                # ask the model to continue from there.
                messages.extend([
                    {
                        "role": "assistant",
                        "content": convert_statements_to_llm_input(current_statements),
                    },
                    {
                        "role": "user",
                        "content": "Some statements were missed in the last extraction. Please continue.",
                    },
                ])
            response = completion(
                model=self.model_name,
                messages=messages,
                # Only the first attempt may hit the disk cache; retries must
                # reach the API to get a (hopefully parseable) fresh answer.
                caching=i_retry == 0,
            )
            raw_message = response.choices[0].message.content
            logging.info("GPT response: %r", raw_message)
            try:
                new_statements = extract_json_list(raw_message)
            except (rapidjson.JSONDecodeError, ValueError):
                logging.error("Failed to decode JSON response")
                continue
            fixed_statements = []
            for statement in new_statements:
                # Drop statements whose mandatory parts cannot be grounded in the text.
                if st := validate_and_fix_statement(chunk, statement):
                    fixed_statements.append(st)
            return fixed_statements
        return []

    def _check_if_exhausted(self, chunk: DocumentChunkInfo, current_statements: list[Statement]) -> bool:
        '''Ask the LLM whether *current_statements* already cover the whole chunk.

        Returns True when the model does NOT answer 'YES' to "are there more
        statements to be added?".
        '''
        text = chunk["text"]
        messages = [
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": f"### INPUT\n\n{text}"},
        ]
        messages.extend([
            {
                "role": "assistant",
                "content": convert_statements_to_llm_input(current_statements),
            },
            {
                "role": "user",
                "content": "Are there still more statements to be added? Answer 'YES' or 'NO'.",
            },
        ])
        logging.info('Querying if the answers are exhausted...')
        response = completion(
            model=self.model_name,
            messages=messages,
        )
        raw_message = response.choices[0].message.content
        logging.info('Response: %r', raw_message)
        # NOTE(review): a substring test -- a reply like "YES, but..." counts as
        # not exhausted; anything without a literal 'YES' counts as exhausted.
        return 'YES' not in raw_message
def get_inflections(word: str) -> set[str]:
    '''Return every known inflected form of *word*, always including the word itself.'''
    forms: set[str] = set()
    for upos, lemmas in getAllLemmas(word).items():
        for lemma in lemmas:
            forms.add(lemma)
            for inflected in getAllInflections(lemma, upos).values():
                forms.update(inflected)
    if not forms:
        # Out-of-vocabulary fallback: inflect the raw word as a noun.
        for inflected in getAllInflectionsOOV(word, 'NOUN').values():
            forms.update(inflected)
    forms.add(word)
    return forms
def extract_json_list(text: str) -> list[LlmStatement]:
    '''Find a JSON list of LLM-format statements in the text and load it.

    Raises ValueError if no '[' ... ']' span exists, and
    rapidjson.JSONDecodeError if the span is not valid (lenient) JSON.
    '''
    start = text.index('[')
    end = text.rindex(']') + 1
    # Lenient parse: the LLM sometimes emits comments or trailing commas.
    return rapidjson.loads(
        text[start:end],
        parse_mode=rapidjson.PM_COMMENTS | rapidjson.PM_TRAILING_COMMAS,
    )
def fuzzy_finditer(text: str, value: str) -> list[regex.Match]:
    '''
    Fuzzy search for a value in the text.
    In case of multiple matches, return the best ones first.
    '''
    fragments: list[str] = []
    # Tokenize the value into words / whitespace / punctuation runs.
    for token in regex.finditer(r'(\w+)|(\s+)|([^\w\s]+)', value):
        word, space, punct = token.group(1), token.group(2), token.group(3)
        if word:
            # A word may appear in the text as any of its inflections; try the
            # longest alternatives first so the engine prefers fuller matches.
            alternatives = sorted(get_inflections(word), key=len, reverse=True)
            escaped = "|".join(regex.escape(alt) for alt in alternatives)
            fragments.append(rf'(?:{escaped})')
        elif space:
            # Whitespace may match any run of non-word characters.
            fragments.append(r'\W+')
        elif punct:
            # Punctuation is optional in the matched text.
            fragments.append(rf'(?:{regex.escape(punct)})?')
    pattern = r'\W*'.join(fragments)
    candidates = list(regex.finditer(pattern, text, regex.IGNORECASE))
    # Best (highest fuzz ratio) matches first; stable for equal scores.
    candidates.sort(key=lambda m: fuzz.ratio(value, m[0]), reverse=True)
    return candidates
def convert_statements_to_llm_input(statements: list[Statement]) -> str:
    '''
    Convert a list of statements to a JSON string for LLM input.
    Note that LLM-format statements do not use PhraseIdentifier.
    '''
    serialized: list[str] = []
    for statement in statements:
        llm_statement: dict[str, bool | list[str]] = {}
        for key, value in statement.items():
            if isinstance(value, bool):
                llm_statement[key] = value
            elif isinstance(value, list):
                # Drop the block index from each PhraseIdentifier; keep the phrase text.
                llm_statement[key] = [item[1] for item in value]
        body = rapidjson.dumps(llm_statement, indent=2, write_mode=rapidjson.WM_SINGLE_LINE_ARRAY)
        serialized.append(textwrap.indent(body, ' '))
    return '[\n' + ',\n'.join(serialized) + '\n]'
def match_parameters_to_blocks(chunk: DocumentChunkInfo, statement: dict) -> Statement:
    '''
    Match each statement parameter to a block in the chunk.
    And fix minor inconsistencies in the quoted text.
    '''
    text = chunk["text"]
    # block_map entries carry "text_range" (char span in `text`) and "index"
    # (the block's identifier within the chunk).
    block_map = chunk["block_map"]
    # For each value in the statement, find candidate blocks that contain it
    # assign_candidates: value -> {block index -> exact matched substring}
    assign_candidates: dict[str, dict[int, str]] = {}
    _unique_blocks: set[int] = set()
    for value_list in statement.values():
        if not isinstance(value_list, list):
            continue
        for value in value_list:
            # Skip empty values and values already resolved earlier.
            if value in assign_candidates or not value:
                continue
            # fuzzy_finditer yields best matches first; setdefault below keeps
            # only the first (best) matched string per (value, block) pair.
            for match in fuzzy_finditer(text, value):
                v_start, v_end = match.span()
                matched_str = text[v_start:v_end]
                for item in block_map:
                    i_start, i_end = item["text_range"]
                    block_idx = item["index"]
                    # Keep the candidate only if the match lies wholly inside the block.
                    if i_start <= v_start < v_end <= i_end:
                        assign_candidates.setdefault(value, {}).setdefault(block_idx, matched_str)
                        _unique_blocks.add(block_idx)
    # Find the smallest range of blocks that contain all values in the statement
    # (sliding window over the sorted block indices; prefer compact spans so a
    # statement's parameters come from neighboring text).
    unique_blocks = sorted(_unique_blocks)
    best_match_range = len(unique_blocks) + 1
    best_matches: dict[str, PhraseIdentifier] = {}
    for i in range(len(unique_blocks)):
        matches: dict[str, PhraseIdentifier] = {}
        for j in range(i + 1, len(unique_blocks) + 1):
            block_idx = unique_blocks[j - 1]
            # Later blocks in the window overwrite earlier assignments for the
            # same value; the window stops growing once every value is covered.
            for value, candidate_matches in assign_candidates.items():
                if block_idx in candidate_matches:
                    matches[value] = (block_idx, candidate_matches[block_idx])
            if len(matches) == len(assign_candidates):
                if j - i < best_match_range:
                    best_match_range = j - i
                    best_matches = matches
                break
    # Transform the statement to include block indices
    transformed_statement: dict[str, bool | list[PhraseIdentifier]] = {}
    for key, value_list in statement.items():
        if isinstance(value_list, bool):
            transformed_statement[key] = value_list
        elif isinstance(value_list, list):
            for value in value_list:
                if value in best_matches:
                    phrase_id = best_matches[value]
                    transformed_statement.setdefault(key, []).append(phrase_id) # type: ignore
                    # The stored text may differ from the LLM's quote (fuzzy match).
                    if phrase_id[1] != value:
                        logging.info("Value %r matched to %r", value, phrase_id)
                else:
                    # Unmatched values are silently dropped from the output statement.
                    logging.warning("Value %r not found in any blocks", value)
    return transformed_statement # type: ignore
def validate_and_fix_statement(chunk: DocumentChunkInfo, statement: LlmStatement) -> Statement | None:
    '''Main "reflection" code -- validate and fix the statement'''
    str_list_keys = ["action", "processor", "recipient", "data", "purpose", "context"]
    bool_keys = ["prohibition"]
    core_keys = ["action", "data"] # Mandatory keys
    cleaned: dict = {}
    for key in str_list_keys:
        raw = statement.get(key)
        if isinstance(raw, str):
            # fix: single value instead of list
            cleaned[key] = [raw]
        elif isinstance(raw, list) and raw:
            cleaned[key] = raw
    for key in bool_keys:
        raw = statement.get(key)
        if raw is True:
            cleaned[key] = True
        elif isinstance(raw, str) and raw.lower() in ("true", "yes"):
            # fix: string instead of boolean
            cleaned[key] = True
    grounded = match_parameters_to_blocks(chunk, cleaned)
    # Reject the statement entirely if a mandatory key could not be grounded.
    for key in core_keys:
        if key not in grounded:
            logging.error("Key %r not found in the text", key)
            return None
    return grounded
def main():
    '''CLI entry point: annotate each workdir's document.json into policy_statements.jsonl.'''
    logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument("workdirs", nargs="+", help="Input directories")
    parser.add_argument("--model-name", default="gpt-4o-mini", help="Model name")
    args = parser.parse_args()
    annotator = Annotator(args.model_name)
    for workdir in args.workdirs:
        logging.info("Processing %s ...", workdir)
        with open(os.path.join(workdir, 'document.json'), 'rb') as fin:
            doc = json.load(fin)
        output_path = os.path.join(workdir, 'policy_statements.jsonl')
        # One JSON record per line: {"chunk_index": ..., "statement": ...}
        with open(output_path, 'w', encoding='utf-8') as fout:
            for chunk_index, statement in annotator.run(doc):
                record = {"chunk_index": chunk_index, "statement": statement}
                fout.write(json.dumps(record) + '\n')
if __name__ == "__main__":
    main()