-
Notifications
You must be signed in to change notification settings - Fork 58
Add unique prefix - increasing counter #217
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
c2c72d7
433d39c
eb1f241
c3700a2
230e689
b3aaa94
7f35220
54138aa
150076d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -138,6 +138,8 @@ def __init__( | |
self.text_creator = EndlessTextCreator( | ||
data=config.source, | ||
) | ||
# Add counter for unique prefixes | ||
self.request_counter = 0 | ||
|
||
def __iter__( | ||
self, | ||
|
@@ -170,22 +172,46 @@ def __iter__( | |
output_tokens_sampler, | ||
): | ||
start_index = rand.randint(0, len(self.text_creator.words)) | ||
# Increment counter for each request | ||
self.request_counter += 1 | ||
yield { | ||
"prompt": self._create_prompt(prompt_tokens, start_index), | ||
"prompt": self._create_prompt( | ||
prompt_tokens, start_index, self.request_counter | ||
), | ||
"prompt_tokens_count": prompt_tokens, | ||
"output_tokens_count": output_tokens, | ||
} | ||
|
||
def _create_prompt(self, prompt_tokens: int, start_index: int) -> str: | ||
def _create_prompt( | ||
self, prompt_tokens: int, start_index: int, request_id: int | ||
) -> str: | ||
""" | ||
Create a prompt with unique prefix to prevent vLLM prefix caching. | ||
Args: | ||
prompt_tokens: Target number of tokens for the prompt | ||
start_index: Starting position in the text corpus | ||
request_id: Unique identifier for this request (used as prefix) | ||
Returns: | ||
Generated prompt string with unique prefix | ||
""" | ||
if prompt_tokens <= 0: | ||
return "" | ||
return f"{request_id}: " | ||
|
||
unique_prefix = f"{request_id}: " | ||
|
||
# Calculate how many tokens the prefix uses | ||
prefix_tokens = len(self.processor.tokenize(unique_prefix)) | ||
|
||
# Adjust target tokens to account for the prefix | ||
remaining_tokens = max(1, prompt_tokens - prefix_tokens) | ||
Comment on lines +178 to +206

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

I don't like how the unique prefix is generated here. Suggested alternative:

    prefix_iter = iter(t for t in self.processor.get_vocab())
    ...
    unique_prefix = next(prefix_iter)
||
|
||
left = start_index | ||
right = start_index + 4 * prompt_tokens | ||
right = start_index + 4 * remaining_tokens | ||
|
||
while left < right: | ||
mid = (left + right) // 2 | ||
test_prompt = self.text_creator.create_text(start_index, mid - start_index) | ||
base_text = self.text_creator.create_text(start_index, mid - start_index) | ||
test_prompt = unique_prefix + base_text | ||
test_tokens = len(self.processor.tokenize(test_prompt)) | ||
|
||
if test_tokens == prompt_tokens: | ||
|
@@ -195,7 +221,8 @@ def _create_prompt(self, prompt_tokens: int, start_index: int) -> str: | |
else: | ||
right = mid | ||
|
||
return self.text_creator.create_text(start_index, left - start_index) | ||
base_text = self.text_creator.create_text(start_index, left - start_index) | ||
return unique_prefix + base_text | ||
|
||
|
||
class SyntheticDatasetCreator(DatasetCreator): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
prompt_tokens should never be less than 1. This is either redundant or there is an error in the sampling code.