Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 10 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@
## 仮想環境

```bash
python3 -m venv venv
source venv/bin/activate
pip install -r requrements.txt
uv sync
```

## データセット
Expand Down Expand Up @@ -66,15 +64,15 @@ MODEL_NAME=llm-jp/llm-jp-3-1.8b-instruct
OUTPUT_DIR=./output/llm-jp-3-1.8b-instruct

# 生成
python3 -m src.llm_jp_judge.generate \
uv run python3 -m src.llm_jp_judge.generate \
output.dir=$OUTPUT_DIR/generation \
client=vllm \
client.model_name=$MODEL_NAME \
benchmark.quality.dataset.path=./data/cache/llm-jp/llm-jp-instructions/v1.0/test.json \
benchmark.safety.dataset.path=./data/cache/llm-jp/AnswerCarefully/v2.0/test.json

# 評価
python3 -m src.llm_jp_judge.evaluate \
uv run python3 -m src.llm_jp_judge.evaluate \
input.dir=$OUTPUT_DIR/generation \
output.dir=$OUTPUT_DIR/evaluation \
client=azure \
Expand Down Expand Up @@ -137,7 +135,7 @@ python3 -m src.llm_jp_judge.evaluate \
非アクティブにしたい場合は、生成時に以下のように指定してください。

```
python3 -m src.llm_jp_judge.generate \
uv run python3 -m src.llm_jp_judge.generate \
benchmark.mt_bench.dataset.path=null
```

Expand All @@ -149,7 +147,7 @@ python3 -m src.llm_jp_judge.generate \
非アクティブにしたい場合は、生成時に以下のように指定してください。

```
python3 -m src.llm_jp_judge.generate \
uv run python3 -m src.llm_jp_judge.generate \
benchmark.mt_bench.dataset.path=null
```

Expand All @@ -162,7 +160,7 @@ python3 -m src.llm_jp_judge.generate \
OpenAI API のモデル名(例:`gpt-4o-2024-08-06`)を指定できます。

```
python3 -m src.llm_jp_judge.evaluate \ # generate or evaluate
uv run python3 -m src.llm_jp_judge.evaluate \ # generate or evaluate
client=openai \
client.model_name=gpt-4o-2024-08-06 \ # モデル名
client.async_request_interval=0.5 # APIリクエストの間隔(秒)
Expand All @@ -176,7 +174,7 @@ python3 -m src.llm_jp_judge.evaluate \ # generate or evaluate
Azure OpenAI APIのデプロイ名(例:`gpt-4o-2024-08-06`)を指定できます。

```
python3 -m src.llm_jp_judge.evaluate \ # generate or evaluate
uv run python3 -m src.llm_jp_judge.evaluate \ # generate or evaluate
client=azure \
client.model_name=gpt-4o-2024-08-06 \ # デプロイ名
client.async_request_interval=0.5 # APIリクエストの間隔(秒)
Expand All @@ -187,7 +185,7 @@ python3 -m src.llm_jp_judge.evaluate \ # generate or evaluate
AWS Bedrock APIのデプロイ名(例:`anthropic.claude-3-5-sonnet-20240620-v1:0`)を指定できます。

```
python3 -m src.llm_jp_judge.evaluate \ # generate or evaluate
uv run python3 -m src.llm_jp_judge.evaluate \ # generate or evaluate
client=bedrock \
client.model_name=anthropic.claude-3-5-sonnet-20240620-v1:0 \ # デプロイ名
client.async_request_interval=10 # APIリクエストの間隔(秒)
Expand All @@ -202,7 +200,7 @@ Hugging Faceのモデル名(例:`llm-jp/llm-jp-3-1.8b-instruct`)もしくはパ
> 対応していない場合、チャットテンプレートに対応したトークナイザーを`client.tokenizer_name`として指定するか、jinja形式のチャットテンプレートを`client.chat_template.path`として与えてください。

```bash
python3 -m src.llm_jp_judge.evaluate \ # generate or evaluate
uv run python3 -m src.llm_jp_judge.evaluate \ # generate or evaluate
client=vllm \
client.model_name=llm-jp/llm-jp-3-1.8b-instruct # Hugging Faceのモデル名 or パス
```
Expand All @@ -217,7 +215,7 @@ python3 -m src.llm_jp_judge.evaluate \ # generate or evaluate
`{entity_name}`、`{project_name}`、`{run_name}`は適宜設定してください。

```
python3 -m src.llm_jp_judge.evaluate \
uv run python3 -m src.llm_jp_judge.evaluate \
dashboard=wandb \
dashboard.entity={entity_name} \
dashboard.project={project_name} \
Expand Down
18 changes: 18 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[project]
name = "llm-jp-judge"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"anthropic==0.49.0",
"awscli==1.38.5",
"boto3==1.37.5",
"datasets>=4.1.1",
"hydra-core==1.3.2",
"openai==1.65.2",
"python-dotenv==1.0.1",
"transformers<4.54.0",
"vllm>=0.6.2",
"wandb==0.19.7",
]
11 changes: 6 additions & 5 deletions requrements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
anthropic==0.49.0
awscli==1.38.5
boto3==1.37.5
hydra-core==1.3.2
openai==1.65.2
vllm==0.6.2
anthropic==0.49.0
python-dotenv==1.0.1
wandb==0.19.7
awscli==1.38.5
boto3==1.37.5
transformers<4.54.0
vllm>=0.6.2
wandb==0.19.7
1 change: 0 additions & 1 deletion src/llm_jp_judge/__main__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import hydra


from . import generate


Expand Down
2 changes: 1 addition & 1 deletion src/llm_jp_judge/client/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .remote import OpenAI, AzureOpenAI, BedrockAnthropic
from .local import vLLMClient
from .remote import AzureOpenAI, BedrockAnthropic, OpenAI


def load_client(name="azure", **kwargs):
Expand Down
3 changes: 1 addition & 2 deletions src/llm_jp_judge/client/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
from copy import deepcopy

import hydra

import torch
from vllm import LLM, SamplingParams
from huggingface_hub import repo_exists
from vllm import LLM, SamplingParams

NUM_GPUS = torch.cuda.device_count()

Expand Down
14 changes: 6 additions & 8 deletions src/llm_jp_judge/client/remote.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
import os
import asyncio
import logging
import os
import warnings
import asyncio

import openai
from openai import OpenAI as OpenAIClient
from openai import AzureOpenAI as AzureOpenAIClient
from anthropic import AnthropicBedrock as AnthropicBedrockClient

from dotenv import load_dotenv

import tqdm
import tqdm.asyncio
from anthropic import AnthropicBedrock as AnthropicBedrockClient
from dotenv import load_dotenv
from openai import AzureOpenAI as AzureOpenAIClient
from openai import OpenAI as OpenAIClient

from .local import BaseClient

Expand Down
2 changes: 1 addition & 1 deletion src/llm_jp_judge/dashboard/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .wandb import WandB
from .base import BaseDashboard
from .wandb import WandB


def load_dashboard(cfg, name=None, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion src/llm_jp_judge/dashboard/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
import json
import os


class BaseDashboard:
Expand Down
3 changes: 1 addition & 2 deletions src/llm_jp_judge/dashboard/wandb.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import wandb

import omegaconf
import wandb

from .base import BaseDashboard

Expand Down
4 changes: 2 additions & 2 deletions src/llm_jp_judge/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .safety import load_safety
from .quality import load_quality
from .mt_bench import load_mt_bench
from .quality import load_quality
from .safety import load_safety


def load_dataset(name, path, size=None):
Expand Down
1 change: 1 addition & 0 deletions src/llm_jp_judge/dataset/mt_bench.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json

import hydra


Expand Down
1 change: 1 addition & 0 deletions src/llm_jp_judge/dataset/quality.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json

import hydra


Expand Down
1 change: 1 addition & 0 deletions src/llm_jp_judge/dataset/safety.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json

import hydra


Expand Down
10 changes: 5 additions & 5 deletions src/llm_jp_judge/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import re
import os
import glob
import logging
import os
import re

import hydra
import logging

from .client import load_client
from .evaluator import load_evaluator
from .utils.data import load_jsonl, load_json
from .dashboard import load_dashboard
from .evaluator import load_evaluator
from .utils.data import load_json, load_jsonl


def load_metadata(cfg):
Expand Down
2 changes: 1 addition & 1 deletion src/llm_jp_judge/evaluator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .mt_bench import MTBenchEvaluator
from .quality import QualityEvaluator
from .safety import SafetyEvaluator
from .mt_bench import MTBenchEvaluator


def load_evaluator(client, dashboard, metadata={}, metric="abs_quality", **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion src/llm_jp_judge/evaluator/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
import json
import logging
import re


class BaseScoreExtractor(object):
Expand Down
7 changes: 3 additions & 4 deletions src/llm_jp_judge/evaluator/mt_bench.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import logging

import json
from copy import deepcopy
import logging
from collections import defaultdict
from copy import deepcopy

from .base import BaseEvaluator, BaseScoreExtractor
from ..utils.data import load_jsonl
from .base import BaseEvaluator, BaseScoreExtractor


class MTBenchEvaluator(BaseEvaluator):
Expand Down
5 changes: 2 additions & 3 deletions src/llm_jp_judge/evaluator/quality.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import re
import json
import logging

from copy import deepcopy
import re
from collections import defaultdict
from copy import deepcopy

from .base import BaseEvaluator

Expand Down
5 changes: 2 additions & 3 deletions src/llm_jp_judge/evaluator/safety.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import re
import logging

from copy import deepcopy
import re
from collections import defaultdict
from copy import deepcopy

from .base import BaseEvaluator, BaseScoreExtractor

Expand Down
8 changes: 3 additions & 5 deletions src/llm_jp_judge/generate.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
import logging
import os
import hydra

from collections import defaultdict

import hydra
from omegaconf import OmegaConf

from .client import load_client
from .utils.data import save_jsonl, save_json
from .dataset import load_dataset

import logging
from .utils.data import save_json, save_jsonl


def generate(cfg, client, benchmark_cfg):
Expand Down
2 changes: 1 addition & 1 deletion src/llm_jp_judge/utils/data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
import json
import os

import hydra

Expand Down
Loading