Skip to content

Commit cb74166

Browse files
adds fine tuning notebook and sample datasets (#60)
* adds initial finetuning notebook
* adds finetuning notebook and sample datasets
* moves sample data to S3 bucket
* adds link to contrastive loss paper. removes some plots
* updates finetuning recipe to be more general
* update github actions workflow and update gitignore file
* test nightly run
* rm nightly run
* includes matplotlib as a dependency
* adds interactive cosine similarity threshold selector

---------

Co-authored-by: Tyler Hutcherson <[email protected]>
1 parent 47e36e2 commit cb74166

File tree

5 files changed

+992
-26
lines changed

5 files changed

+992
-26
lines changed

.github/workflows/nightly-test.yml

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,23 @@ jobs:
1616
runs-on: ubuntu-latest
1717
outputs:
1818
notebooks: ${{ steps.get_nbs.outputs.notebooks }}
19+
has_notebooks: ${{ steps.get_nbs.outputs.has_notebooks }}
1920
steps:
20-
- uses: actions/checkout@v2
21+
- uses: actions/checkout@v3
2122

2223
- id: get_nbs
2324
run: |
24-
# 1) Read ignore patterns from .github/ignore-notebooks.txt
25+
# 1) Find all available notebooks
26+
NBS=$(find python-recipes -name '*.ipynb')
27+
28+
# 2) Load notebooks to ignore
2529
IGNORE_LIST=()
2630
while IFS= read -r skip_nb || [ -n "$skip_nb" ]; do
2731
# Skip empty lines or comment lines
2832
[[ -z "$skip_nb" || "$skip_nb" =~ ^# ]] && continue
2933
IGNORE_LIST+=("$skip_nb")
3034
done < .github/ignore-notebooks.txt
3135
32-
# 2) Find all .ipynb in python-recipes (or your path)
33-
NBS=$(find python-recipes -name '*.ipynb')
34-
3536
# 3) Filter out notebooks that match anything in IGNORE_LIST
3637
FILTERED_NBS=()
3738
for nb in $NBS; do
@@ -42,29 +43,36 @@ jobs:
4243
break
4344
fi
4445
done
45-
4646
if [ "$skip" = false ]; then
4747
FILTERED_NBS+=("$nb")
4848
fi
4949
done
5050

51-
# 4) Convert the final array to compact JSON for GitHub Actions
51+
# 4) Stuff into a single-line JSON array
5252
NB_JSON=$(printf '%s\n' "${FILTERED_NBS[@]}" \
5353
| jq -R . \
5454
| jq -s -c .)
5555

56-
# 5) Default to an empty array if there's nothing left
5756
if [ -z "$NB_JSON" ] || [ "$NB_JSON" = "[]" ]; then
5857
NB_JSON="[]"
5958
fi
6059

6160
echo "All valid notebooks: $NB_JSON"
61+
62+
# 5) Check if there's anything in FILTERED_NBS
63+
if [ "${#FILTERED_NBS[@]}" -gt 0 ]; then
64+
echo "has_notebooks=true" >> $GITHUB_OUTPUT
65+
else
66+
echo "has_notebooks=false" >> $GITHUB_OUTPUT
67+
fi
68+
6269
echo "notebooks=$NB_JSON" >> $GITHUB_OUTPUT
6370

6471
# ---------------------------------------------------------
6572
# 2) Test all notebooks in parallel
6673
# ---------------------------------------------------------
6774
test_all_notebooks:
75+
if: ${{ needs.gather_all_notebooks.outputs.has_notebooks == 'true' }}
6876
needs: gather_all_notebooks
6977
runs-on: ubuntu-latest
7078
strategy:
@@ -79,7 +87,7 @@ jobs:
7987
- 6379:6379
8088

8189
steps:
82-
- uses: actions/checkout@v2
90+
- uses: actions/checkout@v3
8391

8492
# Setup Python
8593
- uses: actions/setup-python@v4

.github/workflows/test.yml

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,28 +17,29 @@ jobs:
1717
runs-on: ubuntu-latest
1818
outputs:
1919
notebooks: ${{ steps.get_nbs.outputs.notebooks }}
20+
has_notebooks: ${{ steps.get_nbs.outputs.has_notebooks }}
2021
steps:
21-
- uses: actions/checkout@v2
22+
- uses: actions/checkout@v3
2223

23-
- id: get_nbs
24+
- name: Gather notebooks
25+
id: get_nbs
2426
run: |
25-
# Compare this commit/PR to 'main' and list changed .ipynb files
27+
# 1) Compare this commit/PR to 'main' and list changed notebooks
2628
git fetch --depth=1 origin main
2729
CHANGED_NOTEBOOKS=$(git diff --name-only origin/main | grep '\.ipynb$' || true)
2830
29-
# 1) Read ignore patterns from .github/ignore-notebooks.txt
31+
# 2) Load notebooks to ignore
3032
IGNORE_LIST=()
3133
while IFS= read -r skip_nb || [ -n "$skip_nb" ]; do
3234
# Skip empty lines or comment lines
3335
[[ -z "$skip_nb" || "$skip_nb" =~ ^# ]] && continue
3436
IGNORE_LIST+=("$skip_nb")
3537
done < .github/ignore-notebooks.txt
3638

37-
# 2) Filter out notebooks in CHANGED_NOTEBOOKS that match ignore patterns
39+
# 3) Filter out ignored notebooks
3840
FILTERED_NBS=()
3941
for nb in $CHANGED_NOTEBOOKS; do
4042
skip=false
41-
4243
# Check if in ignore list
4344
for ignore_nb in "${IGNORE_LIST[@]}"; do
4445
# Partial match:
@@ -47,33 +48,31 @@ jobs:
4748
break
4849
fi
4950
done
50-
5151
if [ "$skip" = false ]; then
5252
FILTERED_NBS+=("$nb")
5353
fi
5454
done
5555

56-
# 3) Build a single-line JSON array
56+
# 4) Stuff into a single-line JSON array
5757
NB_JSON=$(printf '%s\n' "${FILTERED_NBS[@]}" \
5858
| jq -R . \
5959
| jq -s -c .)
6060

61-
# 4) Fallback to an empty array if there's nothing left
6261
if [ -z "$NB_JSON" ] || [ "$NB_JSON" = "[]" ]; then
6362
NB_JSON="[]"
6463
fi
6564

6665
echo "All valid notebooks: $NB_JSON"
6766

68-
# 5) Write to $GITHUB_OUTPUT
69-
if [ "$NB_JSON" != "[]" ]; then
67+
# 5) Check if there's anything in FILTERED_NBS
68+
if [ "${#FILTERED_NBS[@]}" -gt 0 ]; then
7069
echo "has_notebooks=true" >> $GITHUB_OUTPUT
7170
else
7271
echo "has_notebooks=false" >> $GITHUB_OUTPUT
7372
fi
7473

7574
echo "notebooks=$NB_JSON" >> $GITHUB_OUTPUT
76-
75+
7776
# ---------------------------------------------------------
7877
# 2) Test each changed notebook in parallel
7978
# ---------------------------------------------------------
@@ -93,7 +92,7 @@ jobs:
9392
- 6379:6379
9493

9594
steps:
96-
- uses: actions/checkout@v2
95+
- uses: actions/checkout@v3
9796

9897
# Setup Python
9998
- uses: actions/setup-python@v4

.gitignore

Lines changed: 221 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,222 @@
1-
.env
2-
node_modules/
1+
# Created by https://www.toptal.com/developers/gitignore/api/python,venv,macos
2+
# Edit at https://www.toptal.com/developers/gitignore?templates=python,venv,macos
3+
4+
### macOS ###
5+
# General
36
.DS_Store
4-
.pytest_cache/
7+
.AppleDouble
8+
.LSOverride
9+
10+
# Icon must end with two \r
11+
Icon
12+
13+
14+
# Thumbnails
15+
._*
16+
17+
# Files that might appear in the root of a volume
18+
.DocumentRevisions-V100
19+
.fseventsd
20+
.Spotlight-V100
21+
.TemporaryItems
22+
.Trashes
23+
.VolumeIcon.icns
24+
.com.apple.timemachine.donotpresent
25+
26+
# Directories potentially created on remote AFP share
27+
.AppleDB
28+
.AppleDesktop
29+
Network Trash Folder
30+
Temporary Items
31+
.apdisk
32+
33+
### macOS Patch ###
34+
# iCloud generated files
35+
*.icloud
36+
37+
### Python ###
38+
# Byte-compiled / optimized / DLL files
39+
__pycache__/
40+
*.py[cod]
41+
*$py.class
42+
43+
# C extensions
44+
*.so
45+
46+
# Distribution / packaging
47+
.Python
48+
build/
49+
develop-eggs/
50+
dist/
51+
downloads/
52+
eggs/
53+
.eggs/
54+
lib/
55+
lib64/
56+
parts/
57+
sdist/
58+
var/
59+
wheels/
60+
share/python-wheels/
61+
*.egg-info/
62+
.installed.cfg
63+
*.egg
64+
MANIFEST
65+
66+
# PyInstaller
67+
# Usually these files are written by a python script from a template
68+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
69+
*.manifest
70+
*.spec
71+
72+
# Installer logs
73+
pip-log.txt
74+
pip-delete-this-directory.txt
75+
76+
# Unit test / coverage reports
77+
htmlcov/
78+
.tox/
79+
.nox/
80+
.coverage
81+
.coverage.*
82+
.cache
83+
nosetests.xml
84+
coverage.xml
85+
*.cover
86+
*.py,cover
87+
.hypothesis/
88+
.pytest_cache/
89+
cover/
90+
91+
# Translations
92+
*.mo
93+
*.pot
94+
95+
# Django stuff:
96+
*.log
97+
local_settings.py
98+
db.sqlite3
99+
db.sqlite3-journal
100+
101+
# Flask stuff:
102+
instance/
103+
.webassets-cache
104+
105+
# Scrapy stuff:
106+
.scrapy
107+
108+
# Sphinx documentation
109+
docs/_build/
110+
111+
# PyBuilder
112+
.pybuilder/
113+
target/
114+
115+
# Jupyter Notebook
116+
.ipynb_checkpoints
117+
118+
# IPython
119+
profile_default/
120+
ipython_config.py
121+
122+
# pyenv
123+
# For a library or package, you might want to ignore these files since the code is
124+
# intended to run in multiple environments; otherwise, check them in:
125+
.python-version
126+
127+
# pipenv
128+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
129+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
130+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
131+
# install all needed dependencies.
132+
#Pipfile.lock
133+
134+
# poetry
135+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
136+
# This is especially recommended for binary packages to ensure reproducibility, and is more
137+
# commonly ignored for libraries.
138+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
139+
#poetry.lock
140+
141+
# pdm
142+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
143+
#pdm.lock
144+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
145+
# in version control.
146+
# https://pdm.fming.dev/#use-with-ide
147+
.pdm.toml
148+
149+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
150+
__pypackages__/
151+
152+
# Celery stuff
153+
celerybeat-schedule
154+
celerybeat.pid
155+
156+
# SageMath parsed files
157+
*.sage.py
158+
159+
# Environments
160+
.env
161+
.venv
162+
env/
163+
venv/
164+
ENV/
165+
env.bak/
166+
venv.bak/
167+
168+
# Spyder project settings
169+
.spyderproject
170+
.spyproject
171+
172+
# Rope project settings
173+
.ropeproject
174+
175+
# mkdocs documentation
176+
/site
177+
178+
# mypy
179+
.mypy_cache/
180+
.dmypy.json
181+
dmypy.json
182+
183+
# Pyre type checker
184+
.pyre/
185+
186+
# pytype static type analyzer
187+
.pytype/
188+
189+
# Cython debug symbols
190+
cython_debug/
191+
192+
# PyCharm
193+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
194+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
195+
# and can be added to the global gitignore or merged into this file. For a more nuclear
196+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
197+
#.idea/
198+
199+
### Python Patch ###
200+
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
201+
poetry.toml
202+
203+
# ruff
204+
.ruff_cache/
205+
206+
# LSP config files
207+
pyrightconfig.json
208+
209+
### venv ###
210+
# Virtualenv
211+
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
212+
[Bb]in
213+
[Ii]nclude
214+
[Ll]ib
215+
[Ll]ib64
216+
[Ll]ocal
217+
pyvenv.cfg
218+
pip-selfcheck.json
219+
220+
libs/redis/docs/.Trash*
221+
.python-version
222+
.idea/*

.python-version

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.11.9
1+
3.11

0 commit comments

Comments
 (0)