|
8 | 8 | "\n", |
9 | 9 | "WARNING: This feature is new and extremely EXPERIMENTAL. Unlike almost everything else in DSPy, it's currently in pure proof of concept and development mode, but we release it to encourage community involvement.\n", |
10 | 10 | "\n", |
11 | | - "For this tutorial, you will also need DSPy's Arbor RL server.\n", |
| 11 | + "For this tutorial, you will also need [DSPy's Arbor RL framework](https://github.com/Ziems/arbor) which you can install with:\n", |
12 | 12 | "\n", |
13 | 13 | "```bash\n", |
14 | 14 | "> pip install -U arbor-ai\n", |
|
22 | 22 | "outputs": [], |
23 | 23 | "source": [ |
24 | 24 | "import dspy\n", |
25 | | - "from dspy.clients.lm_local_arbor import ArborProvider\n", |
26 | | - "\n", |
27 | 25 | "import arbor\n", |
| 26 | + "from arbor import ArborGRPO, ArborProvider\n", |
28 | 27 | "arbor_server_info = arbor.init() # Initialize the Arbor server in the background\n", |
29 | 28 | "\n", |
30 | 29 | "port = 7453\n", |
31 | | - "local_lm_name = \"Qwen/Qwen2.5-7B-Instruct\"\n", |
| 30 | + "local_lm_name = \"Qwen/Qwen2.5-1.5B-Instruct\"\n", |
32 | 31 | "local_lm = dspy.LM(\n", |
33 | 32 | " model=f\"openai/arbor:{local_lm_name}\",\n", |
34 | 33 | " provider=ArborProvider(),\n", |
35 | | - " temperature=0.7,\n", |
36 | | - " api_base=arbor_server_info[\"api_base\"],\n", |
| 34 | + " api_base=arbor_server_info[\"base_url\"],\n", |
| 35 | + " # Arbor checks to make sure these match the training config\n", |
| 36 | + " temperature=1.0,\n", |
| 37 | + " top_p=1.0,\n", |
| 38 | + " top_k=-1,\n", |
| 39 | + " repetition_penalty=1.0,\n", |
| 40 | + " max_tokens=2048,\n", |
37 | 41 | ")\n", |
38 | 42 | "\n", |
39 | | - "dspy.configure(lm=local_lm)\n", |
40 | | - "\n", |
41 | | - "openai_lm = dspy.LM(model=\"openai/gpt-4.1-mini\")" |
| 43 | + "dspy.configure(lm=local_lm)" |
42 | 44 | ] |
43 | 45 | }, |
44 | 46 | { |
|
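As a quick sanity check that the Arbor-served model is reachable before moving on, you can push a bare prediction through the configured LM. The snippet below is not part of the notebook diff; it simply assumes the `local_lm` setup and `dspy.configure(...)` call from the cell above, and `qa` is an illustrative name.

```python
import dspy

# Assumes the cell above already ran: arbor.init(), the ArborProvider-backed LM,
# and dspy.configure(lm=local_lm). A minimal smoke test routed through the local server.
qa = dspy.Predict("question -> answer")
print(qa(question="What is the capital of France?").answer)
```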
97 | 99 | "source": [ |
98 | 100 | "### Load the HoVer dataset.\n", |
99 | 101 | "\n", |
100 | | - "Let's load a dataset for our task. We'll load examples from the HoVer multi-hop task, where the input is a (really!) complex claim and the output we're seeking is the set of Wikipedia pages that are required to fact-check that claim." |
| 102 | + "Let's load a dataset for our task. We'll load examples from the HoVer multi-hop task, where the input is a (really!) complex claim and the output we're seeking is the set of Wikipedia pages that are required to fact-check that claim.\n", |
| 103 | + "\n", |
| 104 | + "You may have to install an older version of the dataset to get it working properly...\n", |
| 105 | + "```shell\n", |
| 106 | + "> pip install datasets==3.6.0\n", |
| 107 | + "```" |
101 | 108 | ] |
102 | 109 | }, |
103 | 110 | { |
|
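The cell that actually loads HoVer sits outside this diff hunk. For orientation, here is a rough sketch of what loading HoVer into `dspy.Example` objects might look like; the dataset id, the field names (`claim`, `supporting_facts`, `num_hops`), and the `trust_remote_code` flag are assumptions based on the Hugging Face `hover` dataset, and the pin to `datasets==3.6.0` is presumably needed because newer releases dropped support for script-based datasets like this one.

```python
from datasets import load_dataset  # datasets==3.6.0, per the note above

import dspy

# Sketch only: load the 3-hop HoVer examples and keep the claim plus the set of
# gold Wikipedia page titles the program is supposed to retrieve.
hover = load_dataset("hover-nlp/hover", trust_remote_code=True, split="train")

examples = [
    dspy.Example(
        claim=row["claim"],
        titles=sorted({fact["key"] for fact in row["supporting_facts"]}),
    ).with_inputs("claim")
    for row in hover
    if row["num_hops"] == 3
]
```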
226 | 233 | "metadata": {}, |
227 | 234 | "outputs": [], |
228 | 235 | "source": [ |
229 | | - "from dspy.teleprompt.grpo import GRPO\n", |
230 | | - "\n", |
231 | 236 | "program = ResearchHop(num_docs=4, num_hops=2)\n", |
232 | 237 | "program.set_lm(local_lm)\n", |
233 | 238 | "\n", |
234 | | - "# NOTE: Training on 6 GPUs.\n", |
| 239 | + "# NOTE: Training on 4 GPUs.\n", |
235 | 240 | "train_kwargs = {\n", |
236 | 241 | " \"per_device_train_batch_size\": 2,\n", |
237 | | - " \"gradient_accumulation_steps\": 8,\n", |
| 242 | + " \"gradient_accumulation_steps\": 24/6,\n", |
238 | 243 | " \"temperature\": 1.0,\n", |
239 | | - " \"beta\": 0.04,\n", |
240 | | - " \"learning_rate\": 1e-5,\n", |
| 244 | + " \"top_k\": -1,\n", |
| 245 | + " \"top_p\": 1.0,\n", |
| 246 | + " \"repetition_penalty\": 1.0,\n", |
| 247 | + " \"beta\": 0.00,\n", |
| 248 | + " \"learning_rate\": 1e-6,\n", |
241 | 249 | " \"gradient_checkpointing\": True,\n", |
242 | | - " \"gradient_checkpointing_kwargs\": {\"use_reentrant\": False},\n", |
243 | 250 | " \"bf16\": True,\n", |
244 | 251 | " \"lr_scheduler_type\": \"constant_with_warmup\",\n", |
| 252 | + " \"loss_type\": \"dapo\",\n", |
| 253 | + " \"max_steps\": 1000,\n", |
| 254 | + " \"report_to\": \"wandb\",\n", |
| 255 | + " \"log_completions\": True,\n", |
| 256 | + " \"logging_steps\": 1,\n", |
245 | 257 | " \"max_prompt_length\": None,\n", |
246 | 258 | " \"max_completion_length\": None,\n", |
247 | | - " \"scale_rewards\": True,\n", |
248 | | - " \"max_grad_norm\": 0.5,\n", |
249 | | - " \"lora\": True,\n", |
| 259 | + " \"scale_rewards\": False,\n", |
| 260 | + " \"max_grad_norm\": 1.0,\n", |
| 261 | + " \"lora_config\": {\n", |
| 262 | + " \"lora_alpha\": 16,\n", |
| 263 | + " \"lora_dropout\": 0.05,\n", |
| 264 | + " \"r\": 8,\n", |
| 265 | + " \"target_modules\": [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"up_proj\", \"down_proj\", \"gate_proj\"],\n", |
| 266 | + " },\n", |
| 267 | + " \"num_training_gpus\": 3,\n", |
| 268 | + " \"num_inference_gpus\": 1,\n", |
| 269 | + " \"weight_decay\": 0.001,\n", |
250 | 270 | "}\n", |
251 | 271 | "\n", |
252 | | - "compiler = GRPO(\n", |
| 272 | + "compiler = ArborGRPO(\n", |
253 | 273 | " metric=recall,\n", |
254 | 274 | " num_dspy_examples_per_grpo_step=6,\n", |
255 | | - " num_rollouts_per_grpo_step=4,\n", |
| 275 | + " num_rollouts_per_grpo_step=24,\n", |
256 | 276 | " exclude_demos=True,\n", |
257 | | - " num_train_steps=100,\n", |
| 277 | + " num_train_steps=1000,\n", |
258 | 278 | " num_threads=16,\n", |
259 | 279 | " use_train_as_val=False,\n", |
260 | | - " num_steps_for_val=10,\n", |
| 280 | + " num_steps_for_val=50,\n", |
261 | 281 | " train_kwargs=train_kwargs,\n", |
262 | | - " report_train_scores=False,\n", |
| 282 | + " checkpoint=\"single-best\",\n", |
263 | 283 | ")\n", |
264 | 284 | "\n", |
265 | 285 | "optimized_program = compiler.compile(\n", |
266 | 286 | " student=program,\n", |
267 | 287 | " trainset=trainset,\n", |
268 | 288 | " valset=devset,\n", |
269 | | - ")\n" |
| 289 | + ")\n", |
| 290 | + "\n" |
270 | 291 | ] |
271 | 292 | }, |
272 | 293 | { |
|
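Once `compile` finishes, a natural follow-up (not shown in this diff) is to score the optimized program on the dev set with the same metric used during training. A minimal sketch, assuming the `recall` metric and `devset` defined earlier in the notebook:

```python
import dspy

# Compare the compiled program against the baseline on the held-out dev set.
evaluate = dspy.Evaluate(devset=devset, metric=recall, num_threads=16, display_progress=True)

baseline_score = evaluate(program)
optimized_score = evaluate(optimized_program)
print(baseline_score, optimized_score)
```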
290 | 311 | "cell_type": "markdown", |
291 | 312 | "metadata": {}, |
292 | 313 | "source": [ |
293 | | - "In our preliminary experiments, training above for about 18 hours boosts the recall (devset) from 61.8% to 66.2%. This is _typically_ worse on cost/quality basis than you'd get from running prompt optimizers dspy.MIPROv2 or dspy.SIMBA, but it's still a very solid start for online RL over arbitrary LM programs for small LMs." |
| 314 | + "In our preliminary experiments, training about 18 hours boosts the recall (devset) from 61.8% to 66.2%. This is _typically_ worse on cost/quality basis than you'd get from running prompt optimizers dspy.MIPROv2 or dspy.SIMBA, but it's still a very solid start for online RL over arbitrary LM programs for small LMs." |
294 | 315 | ] |
295 | 316 | } |
296 | 317 | ], |
297 | 318 | "metadata": { |
298 | 319 | "kernelspec": { |
299 | | - "display_name": "jun2024_py310", |
| 320 | + "display_name": "arbor-exps", |
300 | 321 | "language": "python", |
301 | 322 | "name": "python3" |
302 | 323 | }, |
|
310 | 331 | "name": "python", |
311 | 332 | "nbconvert_exporter": "python", |
312 | 333 | "pygments_lexer": "ipython3", |
313 | | - "version": "3.10.14" |
| 334 | + "version": "3.11.13" |
314 | 335 | } |
315 | 336 | }, |
316 | 337 | "nbformat": 4, |
|