Skip to content

Commit 7d613f5

Browse files
committed
misc: factor subtitle similarity helpers
1 parent 85eb7a0 commit 7d613f5

File tree

6 files changed

+150
-83
lines changed

6 files changed

+150
-83
lines changed

meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ sources = files(
142142
'misc/json.c',
143143
'misc/language.c',
144144
'misc/natural_sort.c',
145+
'misc/string_similarity.c',
145146
'misc/node.c',
146147
'misc/path_utils.c',
147148
'misc/random.c',

misc/language.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#include "common/common.h"
2424
#include "misc/ctype.h"
25+
#include <string.h>
2526

2627
#define L(s) { #s, sizeof(#s) - 1 }
2728

@@ -297,6 +298,27 @@ int mp_match_lang(char **langs, const char *lang)
297298
return best_score;
298299
}
299300

301+
// Decide whether a single filename token counts as a trailing "suffix"
// (language code or subtitle marker) rather than as part of the title.
//
// token: NUL-terminated string; NULL and "" are rejected.
// Returns true if mp_match_lang() reports a non-zero score for the token, or
// if the token is byte-for-byte equal to one of the fixed marker words below.
// The comparison against the marker words is case-sensitive.
//
// NOTE(review): the token is matched against a list that contains only the
// token itself. If mp_match_lang() scores identical strings as a match, this
// returns true for every non-empty token - confirm against mp_match_lang().
bool mp_language_is_suffix_token(const char *token)
{
    // Marker words that are not language codes but still appear as suffixes.
    static const char *const marker_words[] = {
        "jp", "chs", "cht", "sub", "subs", "sdh", "forced", "cc",
    };
    const size_t marker_count = sizeof(marker_words) / sizeof(marker_words[0]);

    if (token == NULL || token[0] == '\0')
        return false;

    // mp_match_lang() takes a NULL-terminated list of non-const strings; the
    // cast only adapts the type, the token is not written through it here.
    char *self_list[] = {(char *)token, NULL};
    bool is_suffix = mp_match_lang(self_list, token) > 0;

    for (size_t n = 0; !is_suffix && n < marker_count; n++)
        is_suffix = strcmp(token, marker_words[n]) == 0;

    return is_suffix;
}
321+
300322
bstr mp_guess_lang_from_filename(bstr name, int *lang_start, enum track_flags *flags)
301323
{
302324
name = bstr_strip(bstr_strip_ext(name));

misc/language.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,15 @@
2020
#ifndef MP_LANGUAGE_H
2121
#define MP_LANGUAGE_H
2222

23+
#include <stdbool.h>
24+
2325
#include "misc/bstr.h"
2426
#include "common/common.h"
2527

2628
// Result numerically higher => better match. 0 == no match.
2729
int mp_match_lang(char **langs, const char *lang);
2830
char **mp_get_user_langs(void);
2931
bstr mp_guess_lang_from_filename(bstr name, int *lang_start, enum track_flags *flags);
32+
// Returns true if token is non-NULL, non-empty, and either mp_match_lang()
// yields a score > 0 for it or it equals one of the fixed marker words
// ("jp", "chs", "cht", "sub", "subs", "sdh", "forced", "cc").
bool mp_language_is_suffix_token(const char *token);
3033

3134
#endif /* MP_LANGUAGE_H */

misc/string_similarity.c

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* This file is part of mpv.
3+
*
4+
* mpv is free software; you can redistribute it and/or
5+
* modify it under the terms of the GNU Lesser General Public
6+
* License as published by the Free Software Foundation; either
7+
* version 2.1 of the License, or (at your option) any later version.
8+
*
9+
* mpv is distributed in the hope that it will be useful,
10+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
* GNU Lesser General Public License for more details.
13+
*
14+
* You should have received a copy of the GNU Lesser General Public
15+
* License along with mpv. If not, see <http://www.gnu.org/licenses/>.
16+
*/
17+
18+
#include <ctype.h>
19+
#include <string.h>
20+
21+
#include "mpv_talloc.h"
22+
23+
#include "common/common.h"
24+
#include "misc/bstr.h"
25+
#include "misc/language.h"
26+
#include "misc/path_utils.h"
27+
#include "misc/string_similarity.h"
28+
29+
static bool is_suffix_token(const char *tkn)
30+
{
31+
return mp_language_is_suffix_token(tkn);
32+
}
33+
34+
char *mp_normalize_base_name(void *ta_ctx, const char *path)
35+
{
36+
struct bstr base = bstr0(mp_basename(path));
37+
base = bstr_strip_ext(base);
38+
char *tmpbuf = talloc_strndup(ta_ctx, base.start, base.len);
39+
for (int i = 0; tmpbuf[i]; i++)
40+
tmpbuf[i] = tolower((unsigned char)tmpbuf[i]);
41+
char **tokens = NULL;
42+
int ntok = 0;
43+
char *p = tmpbuf;
44+
while (*p) {
45+
while (*p && !isalnum((unsigned char)*p)) p++;
46+
if (!*p) break;
47+
char *start = p;
48+
while (*p && isalnum((unsigned char)*p)) p++;
49+
char save = *p; *p = '\0';
50+
MP_TARRAY_APPEND(ta_ctx, tokens, ntok, talloc_strdup(ta_ctx, start));
51+
*p = save;
52+
}
53+
while (ntok > 0 && is_suffix_token(tokens[ntok - 1]))
54+
ntok--;
55+
char *out = talloc_strdup(ta_ctx, "");
56+
for (int i = 0; i < ntok; i++)
57+
out = talloc_asprintf_append_buffer(out, "%s", tokens[i]);
58+
if (!out[0])
59+
out = talloc_strdup(ta_ctx, tmpbuf);
60+
return out;
61+
}
62+
63+
int mp_levenshtein_dist(const char *a, const char *b)
64+
{
65+
int la = (int)strlen(a), lb = (int)strlen(b);
66+
if (la == 0) return lb;
67+
if (lb == 0) return la;
68+
int *prev = talloc_array(NULL, int, lb + 1);
69+
int *curr = talloc_array(NULL, int, lb + 1);
70+
for (int j = 0; j <= lb; j++) prev[j] = j;
71+
for (int i = 1; i <= la; i++) {
72+
curr[0] = i;
73+
for (int j = 1; j <= lb; j++) {
74+
int cost = a[i - 1] == b[j - 1] ? 0 : 1;
75+
int del = prev[j] + 1;
76+
int ins = curr[j - 1] + 1;
77+
int sub = prev[j - 1] + cost;
78+
int m = del < ins ? del : ins;
79+
curr[j] = m < sub ? m : sub;
80+
}
81+
int *tmpv = prev; prev = curr; curr = tmpv;
82+
}
83+
int d = prev[lb];
84+
talloc_free(prev);
85+
talloc_free(curr);
86+
return d;
87+
}
88+
89+
// Similarity of two NUL-terminated strings as a ratio:
// 1 - levenshtein(a, b) / max(strlen(a), strlen(b)).
// Two empty strings are defined to be identical (1.0).
double mp_similarity_ratio(const char *a, const char *b)
{
    const int len_a = (int)strlen(a);
    const int len_b = (int)strlen(b);

    // Nothing to compare: avoid dividing by zero below.
    if (len_a == 0 && len_b == 0)
        return 1.0;

    const int longest = len_b < len_a ? len_a : len_b;
    const int dist = mp_levenshtein_dist(a, b);

    return 1.0 - (double)dist / (double)longest;
}

misc/string_similarity.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#pragma once
2+
3+
/*
4+
* This file is part of mpv.
5+
*
6+
* mpv is free software; you can redistribute it and/or
7+
* modify it under the terms of the GNU Lesser General Public
8+
* License as published by the Free Software Foundation; either
9+
* version 2.1 of the License, or (at your option) any later version.
10+
*
11+
* mpv is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
* GNU Lesser General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU Lesser General Public
17+
* License along with mpv. If not, see <http://www.gnu.org/licenses/>.
18+
*/
19+
20+
#include <stddef.h>
21+
22+
// Returns a talloc string (parent: ta_ctx) derived from the basename of path:
// extension stripped, lowercased, split into alphanumeric tokens, trailing
// tokens accepted by mp_language_is_suffix_token() removed, and the remaining
// tokens concatenated. Falls back to the lowercased extension-less basename
// if nothing remains.
char *mp_normalize_base_name(void *ta_ctx, const char *path);
23+
// Returns 1 - mp_levenshtein_dist(a, b) / max(strlen(a), strlen(b)),
// or 1.0 if both strings are empty.
double mp_similarity_ratio(const char *a, const char *b);
24+
// Returns the byte-wise Levenshtein distance between a and b (unit cost for
// insertion, deletion and substitution).
int mp_levenshtein_dist(const char *a, const char *b);

player/loadfile.c

Lines changed: 4 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
#include "misc/json.h"
5252
#include "misc/language.h"
5353
#include "misc/bstr.h"
54+
#include "misc/string_similarity.h"
5455

5556
#include "audio/out/ao.h"
5657
#include "filters/f_decoder_wrapper.h"
@@ -942,86 +943,6 @@ int mp_add_external_file(struct MPContext *mpctx, char *filename,
942943
return -1;
943944
}
944945

945-
// Helpers for sub-auto=closest selection
946-
static bool is_suffix_token(const char *tkn)
947-
{
948-
int len = (int)strlen(tkn);
949-
bool alpha = true;
950-
for (int i = 0; i < len; i++)
951-
alpha &= isalpha((unsigned char)tkn[i]) != 0;
952-
if (alpha && (len == 2 || len == 3))
953-
return true;
954-
const char *special[] = {"eng","en","es","fr","de","pt","ru","jp","ja","zh","chs","cht","sub","subs","sdh","forced","cc",NULL};
955-
for (int i = 0; special[i]; i++)
956-
if (strcmp(tkn, special[i]) == 0)
957-
return true;
958-
return false;
959-
}
960-
961-
static char *normalize_base_name(void *ta_ctx, const char *path)
962-
{
963-
struct bstr base = bstr0(mp_basename(path));
964-
base = bstr_strip_ext(base);
965-
char *tmpbuf = talloc_strndup(ta_ctx, base.start, base.len);
966-
for (int i = 0; tmpbuf[i]; i++)
967-
tmpbuf[i] = tolower((unsigned char)tmpbuf[i]);
968-
char **tokens = NULL;
969-
int ntok = 0;
970-
char *p = tmpbuf;
971-
while (*p) {
972-
while (*p && !isalnum((unsigned char)*p)) p++;
973-
if (!*p) break;
974-
char *start = p;
975-
while (*p && isalnum((unsigned char)*p)) p++;
976-
char save = *p; *p = '\0';
977-
MP_TARRAY_APPEND(ta_ctx, tokens, ntok, talloc_strdup(ta_ctx, start));
978-
*p = save;
979-
}
980-
while (ntok > 0 && is_suffix_token(tokens[ntok - 1]))
981-
ntok--;
982-
char *out = talloc_strdup(ta_ctx, "");
983-
for (int i = 0; i < ntok; i++)
984-
out = talloc_asprintf_append_buffer(out, "%s", tokens[i]);
985-
if (!out[0])
986-
out = talloc_strdup(ta_ctx, tmpbuf);
987-
return out;
988-
}
989-
990-
static int levenshtein_dist(const char *a, const char *b)
991-
{
992-
int la = (int)strlen(a), lb = (int)strlen(b);
993-
if (la == 0) return lb;
994-
if (lb == 0) return la;
995-
int *prev = talloc_array(NULL, int, lb + 1);
996-
int *curr = talloc_array(NULL, int, lb + 1);
997-
for (int j = 0; j <= lb; j++) prev[j] = j;
998-
for (int i = 1; i <= la; i++) {
999-
curr[0] = i;
1000-
for (int j = 1; j <= lb; j++) {
1001-
int cost = a[i - 1] == b[j - 1] ? 0 : 1;
1002-
int del = prev[j] + 1;
1003-
int ins = curr[j - 1] + 1;
1004-
int sub = prev[j - 1] + cost;
1005-
int m = del < ins ? del : ins;
1006-
curr[j] = m < sub ? m : sub;
1007-
}
1008-
int *tmpv = prev; prev = curr; curr = tmpv;
1009-
}
1010-
int d = prev[lb];
1011-
talloc_free(prev);
1012-
talloc_free(curr);
1013-
return d;
1014-
}
1015-
1016-
static double similarity_ratio(const char *a, const char *b)
1017-
{
1018-
int la = (int)strlen(a), lb = (int)strlen(b);
1019-
int m = la > lb ? la : lb;
1020-
if (m == 0) return 1.0;
1021-
int d = levenshtein_dist(a, b);
1022-
return 1.0 - (double)d / (double)m;
1023-
}
1024-
1025946
// Returns true if a season/episode could be parsed.
1026947
// Recognizes common patterns like S01E02, s1e2, or 1x02 (case-insensitive).
1027948
static bool parse_season_episode(const char *path, int *out_season, int *out_episode)
@@ -1141,7 +1062,7 @@ void autoload_external_files(struct MPContext *mpctx, struct mp_cancel *cancel)
11411062
int best_sub_index = -1;
11421063
if (opts->sub_auto == 3) {
11431064
void *selctx = talloc_new(tmp);
1144-
char *movie_norm = normalize_base_name(selctx, mpctx->filename);
1065+
char *movie_norm = mp_normalize_base_name(selctx, mpctx->filename);
11451066
int mv_season = -1, mv_episode = -1;
11461067
bool mv_has_se = parse_season_episode(mpctx->filename, &mv_season, &mv_episode);
11471068
double best_score = -1.0;
@@ -1162,8 +1083,8 @@ void autoload_external_files(struct MPContext *mpctx, struct mp_cancel *cancel)
11621083
continue;
11631084
if (!sc[STREAM_VIDEO] && !sc[STREAM_AUDIO])
11641085
continue;
1165-
char *cand_norm = normalize_base_name(selctx, e->fname);
1166-
double score = similarity_ratio(movie_norm, cand_norm);
1086+
char *cand_norm = mp_normalize_base_name(selctx, e->fname);
1087+
double score = mp_similarity_ratio(movie_norm, cand_norm);
11671088
int cand_season = -1, cand_episode = -1;
11681089
int match_se = 0;
11691090
if (mv_has_se && parse_season_episode(e->fname, &cand_season, &cand_episode)) {

0 commit comments

Comments
 (0)