diff --git a/.gitignore b/.gitignore index ce1f41f..5f2ad47 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,6 @@ *.bin *.zip .idea/ -.txt \ No newline at end of file +.txt +*.pyc +best_bst_20480 \ No newline at end of file diff --git a/__pycache__/model_generate.cpython-311.pyc b/__pycache__/model_generate.cpython-311.pyc new file mode 100644 index 0000000..1920607 Binary files /dev/null and b/__pycache__/model_generate.cpython-311.pyc differ diff --git a/ada-bf/__init__.py b/ada-bf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ada-bf/__pycache__/ada_bf.cpython-311.pyc b/ada-bf/__pycache__/ada_bf.cpython-311.pyc new file mode 100644 index 0000000..71820db Binary files /dev/null and b/ada-bf/__pycache__/ada_bf.cpython-311.pyc differ diff --git a/ada-bf/__pycache__/bf.cpython-311.pyc b/ada-bf/__pycache__/bf.cpython-311.pyc new file mode 100644 index 0000000..e3a632a Binary files /dev/null and b/ada-bf/__pycache__/bf.cpython-311.pyc differ diff --git a/ada-bf/__pycache__/disjoint_ada_bf.cpython-311.pyc b/ada-bf/__pycache__/disjoint_ada_bf.cpython-311.pyc new file mode 100644 index 0000000..2c84962 Binary files /dev/null and b/ada-bf/__pycache__/disjoint_ada_bf.cpython-311.pyc differ diff --git a/ada-bf/ada_bf.py b/ada-bf/ada_bf.py new file mode 100644 index 0000000..413b487 --- /dev/null +++ b/ada-bf/ada_bf.py @@ -0,0 +1,214 @@ +import numpy as np +import pandas as pd + +from bf import hashfunc + +# parser = argparse.ArgumentParser() +# parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, +# help="path of the dataset") +# parser.add_argument('--num_group_min', action="store", dest="min_group", type=int, required=True, +# help="Minimum number of groups") +# parser.add_argument('--num_group_max', action="store", dest="max_group", type=int, required=True, +# help="Maximum number of groups") +# parser.add_argument('--size_of_Ada_BF', action="store", dest="R_sum", type=int, required=True, +# help="size of the Ada-BF") +# parser.add_argument('--c_min', action="store", dest="c_min", type=float, required=True, +# help="minimum ratio of the keys") +# parser.add_argument('--c_max', action="store", dest="c_max", type=float, required=True, +# help="maximum ratio of the keys") +# +# results = parser.parse_args() +# DATA_PATH = results.data_path +# num_group_min = results.min_group +# num_group_max = results.max_group +# R_sum = results.R_sum +# c_min = results.c_min +# c_max = results.c_max + +# DATA_PATH = './URL_data.csv' +# num_group_min = 8 +# num_group_max = 12 +# R_sum = 200000 +# c_min = 1.8 +# c_max = 2.1 + + +''' +Load the data and select training data +''' +# data = pd.read_csv(DATA_PATH) +# negative_sample = data.loc[(data['label'] == 0)] +# positive_sample = data.loc[(data['label'] == 1)] +# train_negative = negative_sample.sample(frac=0.3) + +''' +Plot the distribution of scores +''' +# plt.style.use('seaborn-deep') +# +# x = data.loc[data['label'] == 1, 'score'] +# y = data.loc[data['label'] == 0, 'score'] +# bins = np.linspace(0, 1, 25) +# +# plt.hist([x, y], bins, log=True, label=['Keys', 'non-Keys']) +# plt.legend(loc='upper right') +# plt.savefig('./Score_Dist.png') +# plt.show() + + +class Ada_BloomFilter: + def __init__(self, n, hash_len, k_max): + self.n = n + self.hash_len = int(hash_len) + self.h = [] + for i in range(int(k_max)): + self.h.append(hashfunc(self.hash_len)) + self.table = np.zeros(self.hash_len, dtype=int) + + def insert(self, key, k): + for j in range(int(k)): + t = self.h[j](key) + self.table[t] = 1 
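# --- Aside (not part of this patch): a minimal, commented-out usage sketch of the
# --- Ada_BloomFilter defined above. The Ada-BF idea is that keys with a high
# --- learned-model score (confident keys) are inserted with few hash functions,
# --- while low-score keys use up to k_max of them, and test() must be called with
# --- the same k that insert() used. The key strings and sizes below are
# --- illustrative; the sketch assumes it is run from inside the ada-bf/ directory
# --- (so `from ada_bf import Ada_BloomFilter` resolves) with numpy, pandas and
# --- scikit-learn installed.
#
# from ada_bf import Ada_BloomFilter
#
# abf = Ada_BloomFilter(n=2, hash_len=100_000, k_max=4)
# abf.insert('http://high-score-key.example', 1)   # confident key: 1 hash function
# abf.insert('http://low-score-key.example', 4)    # uncertain key: k_max hash functions
# assert abf.test('http://high-score-key.example', 1) == 1
# assert abf.test('http://low-score-key.example', 4) == 1
# print(abf.test('http://unseen-url.example', 4))  # usually 0; 1 only on a false positive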
+ + def test(self, key, k): + test_result = 0 + match = 0 + for j in range(int(k)): + t = self.h[j](key) + match += 1 * (self.table[t] == 1) + if match == k: + test_result = 1 + return test_result + + +def R_size(count_key, count_nonkey, R0): + R = [0] * len(count_key) + R[0] = R0 + for k in range(1, len(count_key)): + R[k] = max( + int(count_key[k] * (np.log(count_nonkey[0] / count_nonkey[k]) / np.log(0.618) + R[0] / count_key[0])), 1) + return R + + +def Find_Optimal_Parameters(c_min, c_max, num_group_min, num_group_max, R_sum, train_negative, positive_sample): + c_set = np.arange(c_min, c_max + 10 ** (-6), 0.1) + FP_opt = train_negative.shape[0] + + k_min = 0 + for k_max in range(num_group_min, num_group_max + 1): + for c in c_set: + tau = sum(c ** np.arange(0, k_max - k_min + 1, 1)) + n = positive_sample.shape[0] + hash_len = R_sum + bloom_filter = Ada_BloomFilter(n, hash_len, k_max) + thresholds = np.zeros(k_max - k_min + 1) + thresholds[-1] = 1.1 + num_negative = sum(train_negative['score'] <= thresholds[-1]) + num_piece = int(num_negative / tau) + 1 + score = train_negative.loc[(train_negative['score'] <= thresholds[-1]), 'score'] + score = np.sort(score) + for k in range(k_min, k_max): + i = k - k_min + score_1 = score[score < thresholds[-(i + 1)]] + if int(num_piece * c ** i) < len(score_1): + thresholds[-(i + 2)] = score_1[-int(num_piece * c ** i)] + + url = positive_sample['url'] + score = positive_sample['score'] + + for score_s, url_s in zip(score, url): + ix = min(np.where(score_s < thresholds)[0]) + k = k_max - ix + bloom_filter.insert(url_s, k) + ML_positive = train_negative.loc[(train_negative['score'] >= thresholds[-2]), 'url'] + url_negative = train_negative.loc[(train_negative['score'] < thresholds[-2]), 'url'] + score_negative = train_negative.loc[(train_negative['score'] < thresholds[-2]), 'score'] + + test_result = np.zeros(len(url_negative)) + ss = 0 + for score_s, url_s in zip(score_negative, url_negative): + ix = min(np.where(score_s < thresholds)[0]) + # thres = thresholds[ix] + k = k_max - ix + test_result[ss] = bloom_filter.test(url_s, k) + ss += 1 + FP_items = sum(test_result) + len(ML_positive) + print('False positive items: %d, Number of groups: %d, c = %f' % (FP_items, k_max, round(c, 2))) + + if FP_opt > FP_items: + FP_opt = FP_items + bloom_filter_opt = bloom_filter + thresholds_opt = thresholds + k_max_opt = k_max + + # print('Optimal FPs: %f, Optimal c: %f, Optimal num_group: %d' % (FP_opt, c_opt, num_group_opt)) + return bloom_filter_opt, thresholds_opt, k_max_opt + + +def run(c_min, c_max, num_group_min, num_group_max, R_sum, path, model, X_query, y_query, query_urls): + data = pd.read_csv(path) + negative_sample = data.loc[(data['label'] == 0)] + positive_sample = data.loc[(data['label'] == 1)] + # train_negative = negative_sample.sample(frac=0.8) + train_negative = negative_sample + + bloom_filter_opt, thresholds_opt, k_max_opt = Find_Optimal_Parameters(c_min, c_max, num_group_min, num_group_max, + R_sum, train_negative, positive_sample) + fn = 0 + fp = 0 + cnt_ml = 0 + cnt_bf = 0 + total = len(X_query) + print(f"query count = {total}") + prediction_results = model.predict(X_query) + + for i in range(total): + true_label = y_query[i] + url = query_urls[i] + score = prediction_results[i] + if score >= thresholds_opt[-2]: + if true_label == 0: + fp += 1 + cnt_ml += 1 + else: + ix = min(np.where(score < thresholds_opt)[0]) + # thres = thresholds[ix] + k = k_max_opt - ix + if bloom_filter_opt.test(url, k) == 1 and true_label == 0: + fp += 1 + cnt_bf += 
1 + elif bloom_filter_opt.test(url, k) == 0 and true_label == 1: + fn = fn + 1 + + print(f"fp: {fp}") + print(f"total: {total}") + print(f"fpr: {float(fp) / total}") + print(f"fnr: {float(fn) / total}") + print(f"cnt_ml: {cnt_ml}") + print(f"cnt_bf: {cnt_bf}") + return float(fp) / total + + +''' +Implement Ada-BF +''' +# if __name__ == '__main__': +# '''Stage 1: Find the hyper-parameters (spare 30% samples to find the parameters)''' +# bloom_filter_opt, thresholds_opt, k_max_opt = Find_Optimal_Parameters(c_min, c_max, num_group_min, num_group_max, +# R_sum, train_negative, positive_sample) +# +# '''Stage 2: Run Ada-BF on all the samples''' +# ### Test URLs +# ML_positive = negative_sample.loc[(negative_sample['score'] >= thresholds_opt[-2]), 'url'] +# url_negative = negative_sample.loc[(negative_sample['score'] < thresholds_opt[-2]), 'url'] +# score_negative = negative_sample.loc[(negative_sample['score'] < thresholds_opt[-2]), 'score'] +# test_result = np.zeros(len(url_negative)) +# ss = 0 +# for score_s, url_s in zip(score_negative, url_negative): +# ix = min(np.where(score_s < thresholds_opt)[0]) +# # thres = thresholds[ix] +# k = k_max_opt - ix +# test_result[ss] = bloom_filter_opt.test(url_s, k) +# ss += 1 +# FP_items = sum(test_result) + len(ML_positive) +# print('False positive items: %d' % FP_items) diff --git a/ada-bf/bf.py b/ada-bf/bf.py new file mode 100644 index 0000000..516ca2f --- /dev/null +++ b/ada-bf/bf.py @@ -0,0 +1,103 @@ +import numpy as np +import pandas as pd +from sklearn.utils import murmurhash3_32 +from random import randint +import argparse + + +def hashfunc(m): + ss = randint(1, 99999999) + + def hash_m(x): + return murmurhash3_32(x, seed=ss) % m + + return hash_m + + +''' +Class for Standard Bloom filter +''' + + +class BloomFilter: + def __init__(self, n, hash_len): + self.n = n + self.hash_len = int(hash_len) + if (self.n > 0) & (self.hash_len > 0): + self.k = max(1, int(self.hash_len / n * 0.6931472)) + elif self.n == 0: + self.k = 1 + self.h = [] + for i in range(self.k): + self.h.append(hashfunc(self.hash_len)) + self.table = np.zeros(self.hash_len, dtype=int) + + def insert(self, key): + if self.hash_len == 0: + raise SyntaxError('cannot insert to an empty hash table') + for i in key: + for j in range(self.k): + t = self.h[j](i) + self.table[t] = 1 + + # def test(self, key): + # test_result = 0 + # match = 0 + # if self.hash_len > 0: + # for j in range(self.k): + # t = self.h[j](key) + # match += 1*(self.table[t] == 1) + # if match == self.k: + # test_result = 1 + # return test_result + + def test(self, keys, single_key=True): + if single_key: + test_result = 0 + match = 0 + if self.hash_len > 0: + for j in range(self.k): + t = self.h[j](keys) + match += 1 * (self.table[t] == 1) + if match == self.k: + test_result = 1 + else: + test_result = np.zeros(len(keys)) + ss = 0 + if self.hash_len > 0: + for key in keys: + match = 0 + for j in range(self.k): + t = self.h[j](key) + match += 1 * (self.table[t] == 1) + if match == self.k: + test_result[ss] = 1 + ss += 1 + return test_result + + +'''Run Bloom filter''' + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, + help="path of the dataset") + parser.add_argument('--size_of_Ada_BF', action="store", dest="R_sum", type=int, required=True, + help="size of the Ada-BF") + + results = parser.parse_args() + DATA_PATH = results.data_path + R_sum = results.R_sum + + data = pd.read_csv(DATA_PATH) + + 
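# --- Aside (not part of this patch): the constant 0.6931472 in BloomFilter.__init__
# --- above is ln 2. For a bit array of m bits holding n keys, k = (m / n) * ln 2
# --- hash functions minimizes the false-positive rate, which is then roughly
# --- (1 - e^(-k*n/m))^k ~= 0.6185^(m/n); the log(0.618) used by R_size in ada_bf.py
# --- and disjoint_ada_bf.py reflects the same approximation. A commented-out sketch
# --- of that arithmetic (the numbers are illustrative, not from this dataset):
#
# import math
#
# def bf_expected_fpr(m_bits, n_keys):
#     """Optimal hash count and resulting false-positive rate of a standard BF."""
#     k = max(1, round(m_bits / n_keys * math.log(2)))
#     return (1.0 - math.exp(-k * n_keys / m_bits)) ** k
#
# # e.g. m = 200000 bits and n = 20000 keys give k = 7 and an FPR of about 0.8%.
# print(bf_expected_fpr(200_000, 20_000))   # -> ~0.0082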
negative_sample = data.loc[(data['label'] == -1)] + positive_sample = data.loc[(data['label'] == 1)] + + url = positive_sample['url'] + n = len(url) + bloom_filter = BloomFilter(n, R_sum) + bloom_filter.insert(url) + url_negative = negative_sample['url'] + n1 = bloom_filter.test(url_negative, single_key=False) + print('False positive items: ', sum(n1)) diff --git a/ada-bf/disjoint_ada_bf.py b/ada-bf/disjoint_ada_bf.py new file mode 100644 index 0000000..a4e43e7 --- /dev/null +++ b/ada-bf/disjoint_ada_bf.py @@ -0,0 +1,237 @@ +import numpy as np +import pandas as pd +from bf import BloomFilter + + +# parser = argparse.ArgumentParser() +# parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, +# help="path of the dataset") +# parser.add_argument('--model_path', action="store", dest="model_path", type=str, required=True, +# help="path of the model") +# parser.add_argument('--num_group_min', action="store", dest="min_group", type=int, required=True, +# help="Minimum number of groups") +# parser.add_argument('--num_group_max', action="store", dest="max_group", type=int, required=True, +# help="Maximum number of groups") +# parser.add_argument('--size_of_Ada_BF', action="store", dest="M_budget", type=int, required=True, +# help="memory budget") +# parser.add_argument('--c_min', action="store", dest="c_min", type=float, required=True, +# help="minimum ratio of the keys") +# parser.add_argument('--c_max', action="store", dest="c_max", type=float, required=True, +# help="maximum ratio of the keys") +# +# results = parser.parse_args() +# DATA_PATH = results.data_path +# num_group_min = results.min_group +# num_group_max = results.max_group +# model_size = os.path.getsize(results.model_path) +# R_sum = results.M_budget - model_size * 8 +# c_min = results.c_min +# c_max = results.c_max +# +# # DATA_PATH = './URL_data.csv' +# # num_group_min = 8 +# # num_group_max = 12 +# # R_sum = 200000 +# # c_min = 1.8 +# # c_max = 2.1 +# +# +# ''' +# Load the data and select training data +# ''' +# data = pd.read_csv(DATA_PATH) +# negative_sample = data.loc[(data['label'] == -1)] +# positive_sample = data.loc[(data['label'] == 1)] +# train_negative = negative_sample.sample(frac=0.3) +# +# ''' +# Plot the distribution of scores +# ''' +# plt.style.use('seaborn-deep') +# +# x = data.loc[data['label'] == 1, 'score'] +# y = data.loc[data['label'] == -1, 'score'] +# bins = np.linspace(0, 1, 25) +# +# plt.hist([x, y], bins, log=True, label=['Keys', 'non-Keys']) +# plt.legend(loc='upper right') +# plt.savefig('./Score_Dist.png') +# plt.show() + + +def R_size(count_key, count_nonkey, R0): + R = [0] * len(count_key) + R[0] = max(R0, 1) + for k in range(1, len(count_key)): + R[k] = max( + int(count_key[k] * (np.log(count_nonkey[0] / count_nonkey[k]) / np.log(0.618) + R[0] / count_key[0])), 1) + return R + + +def Find_Optimal_Parameters(c_min, c_max, num_group_min, num_group_max, R_sum, train_negative, positive_sample): + c_set = np.arange(c_min, c_max + 10 ** (-6), 0.1) + FP_opt = train_negative.shape[0] + + for num_group in range(num_group_min, num_group_max + 1): + for c in c_set: + ### Determine the thresholds + thresholds = np.zeros(num_group + 1) + thresholds[0] = -0.1 + thresholds[-1] = 1.1 + num_negative = train_negative.shape[0] + tau = sum(c ** np.arange(0, num_group, 1)) + num_piece = int(num_negative / tau) + score = np.sort(np.array(list(train_negative['score']))) + + for i in range(1, num_group): + if thresholds[-i] > 0: + score_1 = score[score < thresholds[-i]] + if 
int(num_piece * c ** (i - 1)) <= len(score_1): + thresholds[-(i + 1)] = score_1[-int(num_piece * c ** (i - 1))] + else: + thresholds[-(i + 1)] = 0 + else: + thresholds[-(i + 1)] = 1 + + count_nonkey = np.zeros(num_group) + for j in range(num_group): + count_nonkey[j] = sum((score >= thresholds[j]) & (score < thresholds[j + 1])) + + num_group_1 = sum(count_nonkey > 0) + count_nonkey = count_nonkey[count_nonkey > 0] + thresholds = thresholds[-(num_group_1 + 1):] + + ### Count the keys of each group + url = positive_sample['url'] + score = positive_sample['score'] + + count_key = np.zeros(num_group_1) + url_group = [] + bloom_filter = [] + for j in range(num_group_1): + count_key[j] = sum((score >= thresholds[j]) & (score < thresholds[j + 1])) + url_group.append(url[(score >= thresholds[j]) & (score < thresholds[j + 1])]) + + ### Search the Bloom filters' size + R = np.zeros(num_group_1 - 1) + R[:] = 0.5 * R_sum + non_empty_ix = min(np.where(count_key > 0)[0]) + if non_empty_ix > 0: + R[0:non_empty_ix] = 0 + kk = 1 + while abs(sum(R) - R_sum) > 200: + if (sum(R) > R_sum): + R[non_empty_ix] = R[non_empty_ix] - int((0.5 * R_sum) * (0.5) ** kk + 1) + else: + R[non_empty_ix] = R[non_empty_ix] + int((0.5 * R_sum) * (0.5) ** kk + 1) + R[non_empty_ix:] = R_size(count_key[non_empty_ix:-1], count_nonkey[non_empty_ix:-1], R[non_empty_ix]) + if int((0.5 * R_sum) * (0.5) ** kk + 1) == 1: + break + kk += 1 + + Bloom_Filters = [] + for j in range(int(num_group_1 - 1)): + if j < non_empty_ix: + Bloom_Filters.append([0]) + else: + Bloom_Filters.append(BloomFilter(count_key[j], R[j])) + Bloom_Filters[j].insert(url_group[j]) + + ### Test URLs + ML_positive = train_negative.loc[(train_negative['score'] >= thresholds[-2]), 'url'] + url_negative = train_negative.loc[(train_negative['score'] < thresholds[-2]), 'url'] + score_negative = train_negative.loc[(train_negative['score'] < thresholds[-2]), 'score'] + + test_result = np.zeros(len(url_negative)) + ss = 0 + for score_s, url_s in zip(score_negative, url_negative): + ix = min(np.where(score_s < thresholds)[0]) - 1 + if ix >= non_empty_ix: + test_result[ss] = Bloom_Filters[ix].test(url_s) + else: + test_result[ss] = 0 + ss += 1 + FP_items = sum(test_result) + len(ML_positive) + print('False positive items: %d, Number of groups: %d, c = %f' % (FP_items, num_group, round(c, 2))) + if FP_opt > FP_items: + FP_opt = FP_items + Bloom_Filters_opt = Bloom_Filters + thresholds_opt = thresholds + non_empty_ix_opt = non_empty_ix + + return Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt + + +def run(c_min, c_max, num_group_min, num_group_max, R_sum, path, model, X_query, y_query, query_urls): + data = pd.read_csv(path) + negative_sample = data.loc[(data['label'] == 0)] + positive_sample = data.loc[(data['label'] == 1)] + train_negative = negative_sample.sample(frac=0.8) + + Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt = Find_Optimal_Parameters(c_min, c_max, num_group_min, + num_group_max, R_sum, train_negative, + positive_sample) + fn = 0 + fp = 0 + cnt_ml = 0 + cnt_bf = 0 + total = len(X_query) + print(f"query count = {total}") + prediction_results = model.predict(X_query) + + for i in range(total): + true_label = y_query[i] + url = query_urls[i] + score = prediction_results[i] + # ix = min(np.where(score < thresholds_opt)[0]) + # thres = thresholds[ix] + # k = k_max_opt - ix + if score >= thresholds_opt[-2]: + if true_label == 0: + fp += 1 + cnt_ml += 1 + else: + ix = min(np.where(score < thresholds_opt)[0]) - 1 + test_result = 0 + if ix >= non_empty_ix_opt: + 
test_result = Bloom_Filters_opt[ix].test(url) + if test_result == 1 and true_label == 0: + fp += 1 + cnt_bf += 1 + elif test_result == 0 and true_label == 1: + fn = fn + 1 + + print(f"fp: {fp}") + print(f"total: {total}") + print(f"fpr: {float(fp) / total}") + print(f"fnr: {float(fn) / total}") + print(f"cnt_ml: {cnt_ml}") + print(f"cnt_bf: {cnt_bf}") + return float(fp) / total + +# ''' +# Implement disjoint Ada-BF +# ''' +# if __name__ == '__main__': +# '''Stage 1: Find the hyper-parameters''' +# Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt = Find_Optimal_Parameters(c_min, c_max, num_group_min, +# num_group_max, R_sum, train_negative, +# positive_sample) +# +# '''Stage 2: Run Ada-BF on all the samples''' +# ### Test URLs +# ML_positive = negative_sample.loc[(negative_sample['score'] >= thresholds_opt[-2]), 'url'] +# url_negative = negative_sample.loc[(negative_sample['score'] < thresholds_opt[-2]), 'url'] +# score_negative = negative_sample.loc[(negative_sample['score'] < thresholds_opt[-2]), 'score'] +# test_result = np.zeros(len(url_negative)) +# ss = 0 +# for score_s, url_s in zip(score_negative, url_negative): +# ix = min(np.where(score_s < thresholds_opt)[0]) - 1 +# if ix >= non_empty_ix_opt: +# test_result[ss] = Bloom_Filters_opt[ix].test(url_s) +# else: +# test_result[ss] = 0 +# ss += 1 +# FP_items = sum(test_result) + len(ML_positive) +# FPR = FP_items / len(url_negative) +# print('False positive items: {}; FPR: {}; Size of quries: {}'.format(FP_items, FPR, len(url_negative))) diff --git a/ada-bf/main.py b/ada-bf/main.py new file mode 100644 index 0000000..024b9ac --- /dev/null +++ b/ada-bf/main.py @@ -0,0 +1,78 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd + +import lib.lgb_url +import ada_bf +import disjoint_ada_bf + +df_train = pd.read_csv('../dataset/url_train.csv') +df_test = pd.read_csv('../dataset/url_test.csv') +df_query = pd.read_csv('../dataset/url_query.csv') + +train_urls = df_train['url'] +test_urls = df_test['url'] +query_urls = df_query['url'] + +X_train = df_train.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_train = df_train['url_type'].values.astype(np.float32) +X_test = df_test.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_test = df_test['url_type'].values.astype(np.float32) +X_query = df_query.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_query = df_query['url_type'].values.astype(np.float32) + +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) +n_true = df_train[df_train['url_type'] == 1].shape[0] + df_test[df_test['url_type'] == 1].shape[0] +n_test = len(df_test) + +bst = lgb.Booster(model_file='../best_bst_20480') + +y_pred_train = bst.predict(X_train) +y_pred_test = bst.predict(X_test) + +train_results = pd.DataFrame({ + 'url': train_urls, + 'label': y_train, + 'score': y_pred_train +}) + +test_results = pd.DataFrame({ + 'url': test_urls, + 'label': y_test, + 'score': y_pred_test +}) +all_results = pd.concat([train_results, test_results]) +all_results.to_csv('url_results.csv', index=False) + +# 初始化变量 +model_size = lib.lgb_url.lgb_get_model_size(bst) +print("模型在内存中所占用的大小(字节):", model_size) + +for size in range(64 * 1024, 320 * 1024 + 1, 64 * 1024): + bloom_size = size - model_size + ada_bf.run( + num_group_min=8, + num_group_max=12, + R_sum=bloom_size*8, + c_min=1.6, + c_max=2.5, + path='url_results.csv', + model=bst, + X_query=X_query, + y_query=y_query, + query_urls=query_urls + ) + # 
disjoint_ada_bf.run( + # num_group_min=8, + # num_group_max=12, + # R_sum=bloom_size*8, + # c_min=1.6, + # c_max=2.5, + # path='url_results.csv', + # model=bst, + # X_query=X_query, + # y_query=y_query, + # query_urls=query_urls + # ) + size *= 2 diff --git a/ada-bf/main_yelp.py b/ada-bf/main_yelp.py new file mode 100644 index 0000000..b3ca7cf --- /dev/null +++ b/ada-bf/main_yelp.py @@ -0,0 +1,116 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd +import lib.lgb_url +import ada_bf +import disjoint_ada_bf + +import lib.network +import lib.data_processing +import lib.lgb_url +import lib.bf_util + + +data_train = pd.read_csv('../dataset/yelp/yelp_train.csv') +data_test = pd.read_csv('../dataset/yelp/yelp_test.csv') +data_query = pd.read_csv('../dataset/yelp/yelp_query.csv') + +word_dict, region_dict = lib.data_processing.loading_embedding("yelp") + + +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) + + # 生成一个用于神经网络输入的dataframe:embedding + embedding = pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1) + #print(embedding) + y = data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + #print(X) + return X, y, insert + + +X_train, y_train, train_insert = yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict) +X_test, y_test, test_insert = yelp_embedding(data_test, word_dict=word_dict, region_dict=region_dict) +X_query, y_query, query_insert = yelp_embedding(data_query, word_dict=word_dict, region_dict=region_dict) + +n_true = data_train[data_train['is_in'] == 1].shape[0] + data_test[data_test['is_in'] == 1].shape[0] +n_test = len(data_test) +# 清理内存 + + + +bst = lgb.Booster(model_file='../best_bst_20480') + +y_pred_train = bst.predict(X_train) +y_pred_test = bst.predict(X_test) + +train_results = pd.DataFrame({ + 'url': train_insert, + 'label': y_train, + 'score': y_pred_train +}) + +test_results = pd.DataFrame({ + 'url': test_insert, + 'label': y_test, + 'score': y_pred_test +}) +all_results = pd.concat([train_results, test_results]) +all_results.to_csv('url_results.csv', index=False) + +# 初始化变量 +model_size = lib.lgb_url.lgb_get_model_size(bst) +print("模型在内存中所占用的大小(字节):", model_size) + +initial_size = 32 * 1024 +max_size = 512 * 1024 + +# 循环,从32开始,每次乘以2,直到512 +size = initial_size +while size <= max_size: + bloom_size = size - model_size + # ada_bf.run( + # num_group_min=8, + # num_group_max=12, + # R_sum=bloom_size*8, + # c_min=1.6, + # c_max=2.5, + # path='url_results.csv', + # model=bst, + # X_query=X_query, + # y_query=y_query, + # query_urls=query_insert + # ) + disjoint_ada_bf.run( + num_group_min=8, + num_group_max=12, + R_sum=bloom_size*8, + c_min=1.6, + c_max=2.5, + 
path='url_results.csv', + model=bst, + X_query=X_query, + y_query=y_query, + query_urls=query_insert + ) + size *= 2 diff --git a/ada-bf/yelp_main.py b/ada-bf/yelp_main.py new file mode 100644 index 0000000..e506276 --- /dev/null +++ b/ada-bf/yelp_main.py @@ -0,0 +1,131 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd +import lib.network +import lib.data_processing +import lib.lgb_url +import ada_bf +import disjoint_ada_bf + +data_train = pd.read_csv('../dataset/yelp/yelp_train.csv') +data_test = pd.read_csv('../dataset/yelp/yelp_test.csv') +data_query = pd.read_csv('../dataset/yelp/yelp_query.csv') + +word_dict, region_dict = lib.data_processing.loading_embedding("yelp") + + +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + true_num = data_train[data_train['is_in'] == 1].shape[0] + false_num = data_train[data_train['is_in'] == 0].shape[0] + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) + + # 生成一个用于神经网络输入的dataframe:embedding + embedding = pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1) + # print(embedding) + y = data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + # print(X) + return X, y, insert, true_num, false_num + + +X_train, y_train, train_insert, train_true, train_false = yelp_embedding(data_train, word_dict=word_dict, + region_dict=region_dict) +X_test, y_test, test_insert, test_true, test_false = yelp_embedding(data_test, word_dict=word_dict, + region_dict=region_dict) +X_query, y_query, query_insert, query_true, query_false = yelp_embedding(data_query, word_dict=word_dict, + region_dict=region_dict) +print(query_insert) +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) + +n_true = train_true + test_true +n_false = test_false + train_false + +n_test = test_true + test_false + + + +# 3. 
划分训练集和测试集 +X_train = X_train.values.astype(np.float32) +X_test = X_test.values.astype(np.float32) +y_train = y_train.values.astype(np.float32) +y_test = y_test.values.astype(np.float32) +query_data = lgb.Dataset(X_query, label=y_query, free_raw_data=False) + +bst = lgb.Booster(model_file='../best_bst_20480') + +y_pred_train = bst.predict(X_train) +y_pred_test = bst.predict(X_test) +y_pred_query = bst.predict(X_query) + +train_results = pd.DataFrame({ + 'url': train_insert, + 'label': y_train, + 'score': y_pred_train +}) + +test_results = pd.DataFrame({ + 'url': test_insert, + 'label': y_test, + 'score': y_pred_test +}) +all_results = pd.concat([train_results, test_results]) +all_results.to_csv('url_results.csv', index=False) +query_results = pd.DataFrame({ + 'url': query_insert, + 'label': y_query, + 'score': y_pred_query +}) +all_results = pd.concat([train_results, test_results]) +all_results.to_csv('url_results.csv', index=False) + +# 初始化变量 +model_size = lib.lgb_url.lgb_get_model_size(bst) +print("模型在内存中所占用的大小(字节):", model_size) + +for size in range(64 * 1024, 320 * 1024 + 1, 64 * 1024): + bloom_size = size - model_size + ada_bf.run( + num_group_min=8, + num_group_max=12, + R_sum=bloom_size * 8, + c_min=1.6, + c_max=2.5, + path='url_results.csv', + model=bst, + X_query=X_query, + y_query=y_query, + query_urls=query_insert + ) + # disjoint_ada_bf.run( + # num_group_min=8, + # num_group_max=12, + # R_sum=bloom_size*8, + # c_min=1.6, + # c_max=2.5, + # path='url_results.csv', + # model=bst, + # X_query=X_query, + # y_query=y_query, + # query_urls=query_urls + # ) + size *= 2 diff --git a/best_bst_20480 b/best_bst_20480 new file mode 100644 index 0000000..7886974 --- /dev/null +++ b/best_bst_20480 @@ -0,0 +1,425 @@ +tree +version=v4 +num_class=1 +num_tree_per_iteration=1 +label_index=0 +max_feature_idx=371 +objective=binary sigmoid:1 +feature_names=Column_0 Column_1 Column_2 Column_3 Column_4 Column_5 Column_6 Column_7 Column_8 Column_9 Column_10 Column_11 Column_12 Column_13 Column_14 Column_15 Column_16 Column_17 Column_18 Column_19 Column_20 Column_21 Column_22 Column_23 Column_24 Column_25 Column_26 Column_27 Column_28 Column_29 Column_30 Column_31 Column_32 Column_33 Column_34 Column_35 Column_36 Column_37 Column_38 Column_39 Column_40 Column_41 Column_42 Column_43 Column_44 Column_45 Column_46 Column_47 Column_48 Column_49 Column_50 Column_51 Column_52 Column_53 Column_54 Column_55 Column_56 Column_57 Column_58 Column_59 Column_60 Column_61 Column_62 Column_63 Column_64 Column_65 Column_66 Column_67 Column_68 Column_69 Column_70 Column_71 Column_72 Column_73 Column_74 Column_75 Column_76 Column_77 Column_78 Column_79 Column_80 Column_81 Column_82 Column_83 Column_84 Column_85 Column_86 Column_87 Column_88 Column_89 Column_90 Column_91 Column_92 Column_93 Column_94 Column_95 Column_96 Column_97 Column_98 Column_99 Column_100 Column_101 Column_102 Column_103 Column_104 Column_105 Column_106 Column_107 Column_108 Column_109 Column_110 Column_111 Column_112 Column_113 Column_114 Column_115 Column_116 Column_117 Column_118 Column_119 Column_120 Column_121 Column_122 Column_123 Column_124 Column_125 Column_126 Column_127 Column_128 Column_129 Column_130 Column_131 Column_132 Column_133 Column_134 Column_135 Column_136 Column_137 Column_138 Column_139 Column_140 Column_141 Column_142 Column_143 Column_144 Column_145 Column_146 Column_147 Column_148 Column_149 Column_150 Column_151 Column_152 Column_153 Column_154 Column_155 Column_156 Column_157 Column_158 Column_159 Column_160 Column_161 
Column_162 Column_163 Column_164 Column_165 Column_166 Column_167 Column_168 Column_169 Column_170 Column_171 Column_172 Column_173 Column_174 Column_175 Column_176 Column_177 Column_178 Column_179 Column_180 Column_181 Column_182 Column_183 Column_184 Column_185 Column_186 Column_187 Column_188 Column_189 Column_190 Column_191 Column_192 Column_193 Column_194 Column_195 Column_196 Column_197 Column_198 Column_199 Column_200 Column_201 Column_202 Column_203 Column_204 Column_205 Column_206 Column_207 Column_208 Column_209 Column_210 Column_211 Column_212 Column_213 Column_214 Column_215 Column_216 Column_217 Column_218 Column_219 Column_220 Column_221 Column_222 Column_223 Column_224 Column_225 Column_226 Column_227 Column_228 Column_229 Column_230 Column_231 Column_232 Column_233 Column_234 Column_235 Column_236 Column_237 Column_238 Column_239 Column_240 Column_241 Column_242 Column_243 Column_244 Column_245 Column_246 Column_247 Column_248 Column_249 Column_250 Column_251 Column_252 Column_253 Column_254 Column_255 Column_256 Column_257 Column_258 Column_259 Column_260 Column_261 Column_262 Column_263 Column_264 Column_265 Column_266 Column_267 Column_268 Column_269 Column_270 Column_271 Column_272 Column_273 Column_274 Column_275 Column_276 Column_277 Column_278 Column_279 Column_280 Column_281 Column_282 Column_283 Column_284 Column_285 Column_286 Column_287 Column_288 Column_289 Column_290 Column_291 Column_292 Column_293 Column_294 Column_295 Column_296 Column_297 Column_298 Column_299 Column_300 Column_301 Column_302 Column_303 Column_304 Column_305 Column_306 Column_307 Column_308 Column_309 Column_310 Column_311 Column_312 Column_313 Column_314 Column_315 Column_316 Column_317 Column_318 Column_319 Column_320 Column_321 Column_322 Column_323 Column_324 Column_325 Column_326 Column_327 Column_328 Column_329 Column_330 Column_331 Column_332 Column_333 Column_334 Column_335 Column_336 Column_337 Column_338 Column_339 Column_340 Column_341 Column_342 Column_343 Column_344 Column_345 Column_346 Column_347 Column_348 Column_349 Column_350 Column_351 Column_352 Column_353 Column_354 Column_355 Column_356 Column_357 Column_358 Column_359 Column_360 Column_361 Column_362 Column_363 Column_364 Column_365 Column_366 Column_367 Column_368 Column_369 Column_370 Column_371 +feature_infos=[0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1330] [0:72] [0:173] [0:562] [0:2853] [0:50] [0:838] [0:1628] [0:63] [0:1089] [0:3129] [0:53] [0:810] [0:33] [0:61] [0:333] [0:2101] [0:19] [0:474] [0:984] [0:42] [0:834] [0:1488] [0:50] [-0.70703125:0.7109375] [-0.65234375:0.78125] [-0.77734375:0.6484375] [-0.69140625:0.890625] [-0.80078125:0.63671875] [-0.77734375:0.796875] [-0.59375:0.703125] [-0.9296875:0.6796875] [-0.73828125:0.77734375] [-0.55078125:0.890625] [-0.875:0.69140625] [-0.78515625:0.60546875] [-0.74609375:0.8515625] [-0.6328125:0.8359375] [-0.82421875:0.75390625] [-0.5859375:0.7734375] [-0.734375:0.94140625] [-0.7265625:0.82421875] [-1.0234375:0.81640625] [-0.796875:0.6875] [-0.6484375:1.0546875] [-0.83203125:0.6875] [-0.63671875:0.8671875] [-0.70703125:0.6875] [-0.73046875:0.796875] [-0.87109375:0.6953125] [-0.8125:0.59765625] [-0.83203125:0.7265625] [-0.859375:0.765625] [-0.88671875:0.84765625] [-1.0078125:0.609375] [-0.671875:0.63671875] 
[-0.7421875:0.734375] [-0.8046875:0.65625] [-0.74609375:0.734375] [-0.99609375:0.66796875] [-0.6015625:0.83203125] [-0.73046875:0.80078125] [-0.78125:0.81640625] [-0.734375:0.79296875] [-0.66796875:0.75390625] [-0.78515625:0.63671875] [-0.58984375:0.80078125] [-0.72265625:0.80078125] [-0.76953125:0.73046875] [-0.8125:0.578125] [-0.73046875:0.640625] [-0.70703125:0.74609375] [-0.8125:0.87890625] [-0.609375:0.90625] [-0.9140625:0.66015625] [-0.76171875:0.68359375] [-0.72265625:0.76171875] [-0.7734375:0.71484375] [-0.7421875:0.75] [-0.6953125:0.640625] [-0.796875:0.65234375] [-0.84765625:0.66015625] [-0.76171875:0.65625] [-0.80859375:0.6953125] [-0.89453125:0.77734375] [-0.6640625:1.03125] [-0.890625:0.6328125] [-0.75:0.64453125] [-0.73046875:0.8203125] [-0.8671875:0.7890625] [-0.98046875:0.5703125] [-0.859375:0.7421875] [-0.66796875:0.7265625] [-0.5546875:0.87109375] [-0.625:0.7734375] [-0.7734375:0.7421875] [-0.94921875:0.7578125] [-0.85546875:0.75] [-0.89453125:0.447265625] [-0.890625:0.703125] [-0.921875:0.59375] [-0.91015625:0.65625] [-0.62890625:0.7421875] [-0.76953125:0.66796875] [-0.8359375:0.66015625] [-0.82421875:0.6640625] [-0.71875:0.578125] [-0.75390625:0.75] [-0.76171875:0.76171875] [-0.6484375:0.79296875] [-0.74609375:0.72265625] [-0.5546875:0.828125] [-0.609375:0.75390625] [-0.7109375:0.6796875] [-0.82421875:0.7421875] [-0.72265625:0.73046875] [-0.7734375:0.57421875] [-0.77734375:0.828125] [-0.71484375:0.6328125] [-1.1796875:0.7109375] [-0.6328125:0.734375] [-0.81640625:0.8671875] [-0.625:0.8671875] [-0.8125:0.64453125] [-0.72265625:0.703125] [-0.98828125:0.84375] [-0.5625:0.7890625] [-0.76953125:0.6796875] [-0.625:0.72265625] [-0.86328125:0.62109375] [-0.82421875:0.9140625] [-0.6953125:0.82421875] [-0.796875:0.84375] [-0.76171875:0.57421875] [-0.85546875:0.73828125] [-0.8984375:0.859375] [-0.65234375:0.76171875] [-0.77734375:0.875] [-0.57421875:0.77734375] [-0.75:0.65234375] [-0.76171875:0.6953125] [-0.75:0.7734375] [-0.7421875:0.8359375] [-0.9140625:0.6953125] [-0.75390625:0.66796875] [-0.84765625:0.6484375] [-0.84375:0.6640625] [-0.671875:0.75390625] [-0.58203125:0.9609375] [-0.81640625:0.71484375] [-0.875:0.7265625] [-0.640625:0.8984375] [-0.83203125:0.81640625] [-0.73828125:0.7265625] [-0.7421875:0.61328125] [-0.72265625:0.71484375] [-0.86328125:0.66796875] [-0.6875:0.82421875] [-0.7890625:0.58203125] [-0.640625:0.796875] [-0.9296875:0.72265625] [-0.7265625:0.96484375] [-0.82421875:0.828125] [-0.5625:1.09375] [-0.8203125:0.85546875] [-0.73046875:0.72265625] [-0.71484375:0.64453125] [-0.73046875:0.73046875] [-0.79296875:0.83984375] [-0.83203125:0.83203125] [-0.765625:0.72265625] [-0.76171875:0.80859375] [-0.95703125:0.6484375] [-0.91015625:0.6796875] [-0.4765625:0.92578125] [-0.8671875:0.703125] [-1.1640625:0.6796875] [-0.71875:0.89453125] [-0.64453125:0.64453125] [-0.8671875:0.68359375] [-0.79296875:0.75] [-0.72265625:0.90234375] [-1.0625:0.61328125] [-0.67578125:0.75] [-0.6796875:0.66796875] [-0.7578125:0.7734375] [-0.6640625:0.81640625] [-0.97265625:0.71875] [-0.68359375:0.6484375] [-0.796875:0.69140625] [-0.7734375:1.0859375] [-0.7109375:0.62890625] [-0.85546875:0.734375] [-0.62890625:0.71484375] [-0.81640625:0.66796875] [-0.74609375:0.76171875] [-0.64453125:0.6875] [-0.6796875:0.69921875] [-0.87890625:0.609375] [-0.828125:0.640625] [-0.66796875:0.7890625] [-0.796875:0.671875] [-0.9765625:0.7421875] [-0.77734375:0.671875] [-1.1796875:0.58203125] [-0.6796875:0.6171875] [-0.6484375:0.83984375] [-0.89453125:0.75390625] [-0.59765625:0.66796875] [-0.71484375:0.71484375] 
[-0.71484375:0.6875] [-0.75:0.70703125] [-0.78125:0.79296875] [-0.609375:0.7734375] [-0.74609375:0.7890625] [-0.875:0.75] [-1.015625:0.7421875] [-0.69140625:0.75] [-0.64453125:0.8125] [-0.609375:0.91796875] [-0.7578125:0.64453125] [-0.73828125:0.68359375] [-0.7734375:0.875] [-0.77734375:0.625] [-0.87109375:0.90234375] [-0.75:0.80078125] [-0.83203125:0.82421875] [-0.796875:0.6796875] [-0.76171875:0.765625] [-0.83203125:0.59375] [-0.72265625:0.73828125] [-0.80078125:0.8828125] [-0.7578125:0.8203125] [-0.71484375:0.8828125] [-1:0.64453125] [-0.6953125:0.7109375] [-0.80078125:0.7734375] [-0.76953125:0.8984375] [-0.7890625:0.8828125] [-0.73828125:0.75] [-0.89453125:0.73046875] [-0.765625:0.765625] [-0.796875:0.73046875] [-0.828125:0.78515625] [-0.73828125:0.66796875] [-0.63671875:0.8984375] [-0.67578125:0.79296875] [-0.80859375:0.7578125] [-0.6875:0.80859375] [-0.61328125:0.74609375] [-0.734375:0.890625] [-0.7109375:0.9296875] [-0.82421875:0.78125] [-0.84375:0.73828125] [-0.69921875:0.72265625] [-0.6953125:0.6328125] [-0.734375:0.765625] [-0.796875:0.875] [-0.72265625:0.78515625] [-0.84375:0.73828125] [-0.6640625:0.92578125] [-0.87109375:0.59375] [-0.6875:0.71484375] [-0.71875:0.75390625] [-0.53515625:0.85546875] [-0.9765625:0.6328125] [-0.65625:0.84375] [-0.9140625:0.58203125] [-0.71875:0.64453125] [-0.890625:0.65234375] [-0.78125:0.63671875] [-0.6875:0.99609375] [-0.67578125:0.6328125] [-0.8046875:0.8125] [-0.7890625:0.81640625] [-0.8359375:0.76171875] [-0.796875:0.83984375] [-0.796875:0.83984375] [-0.6015625:0.9921875] [-0.8125:0.67578125] [-0.67578125:0.82421875] [-0.84375:0.6953125] [-0.80078125:0.625] [-0.78515625:0.6328125] [-0.68359375:0.74609375] [-0.71484375:0.76171875] [-0.98046875:0.80078125] [-0.828125:0.73046875] [-0.99609375:0.6875] [-0.796875:0.9765625] [-0.94140625:0.72265625] [-0.828125:0.66015625] [-0.9375:0.6171875] [-0.7890625:0.53515625] [-0.71484375:0.58203125] [-0.56640625:0.765625] [-0.7265625:0.74609375] [-0.70703125:0.796875] [-0.70703125:0.78515625] [-0.7265625:0.70703125] [-0.83984375:0.82421875] [-0.87890625:0.68359375] [-0.86328125:0.640625] [-0.6953125:0.80859375] [-0.66796875:0.63671875] [-0.62109375:0.7734375] [-0.73046875:0.7734375] [-0.6875:0.98046875] [-0.65234375:0.66796875] [-0.8671875:0.6484375] [-0.84375:0.87890625] [-0.6484375:0.78125] [-0.6796875:0.84375] [-0.640625:0.91015625] [-0.6953125:0.625] [-0.71875:0.76171875] [-0.70703125:0.57421875] [-0.828125:0.8046875] [-0.65234375:0.76953125] [-0.765625:0.70703125] [-0.78125:0.78125] [-0.73046875:0.77734375] [-0.74609375:0.8828125] [-0.671875:0.765625] +tree_sizes=355 353 353 353 353 354 353 354 353 353 354 355 355 355 355 + +Tree=0 +num_leaves=2 +num_cat=0 +split_feature=60 +split_gain=434984 +threshold=2.5000000000000004 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=0.51596630501956908 0.73185020954786251 +leaf_weight=34976.231331303716 70093.534802645445 +leaf_count=155701 312030 +internal_value=0.659986 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=1 + + +Tree=1 +num_leaves=2 +num_cat=0 +split_feature=48 +split_gain=385732 +threshold=2.5000000000000004 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.12954318817138807 0.071017685330332536 +leaf_weight=37093.395588040352 67780.437428444624 +leaf_count=158726 309005 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=2 +num_leaves=2 +num_cat=0 +split_feature=48 +split_gain=344753 +threshold=2.5000000000000004 +decision_type=2 +left_child=-1 +right_child=-2 
+leaf_value=-0.11944689247612593 0.069287281783356455 +leaf_weight=38186.443996280432 66043.623538717628 +leaf_count=158726 309005 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=3 +num_leaves=2 +num_cat=0 +split_feature=60 +split_gain=311052 +threshold=2.5000000000000004 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.11300179071450352 0.066758147395448705 +leaf_weight=38218.333420440555 64983.678175449371 +leaf_count=155701 312030 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=4 +num_leaves=2 +num_cat=0 +split_feature=48 +split_gain=282019 +threshold=2.5000000000000004 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.10456937510987918 0.066276112699463027 +leaf_weight=39380.240540534258 62478.739326238632 +leaf_count=158726 309005 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=5 +num_leaves=2 +num_cat=0 +split_feature=60 +split_gain=257401 +threshold=3.5000000000000004 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.099164731223329375 0.064818720206912719 +leaf_weight=39471.431980535388 60779.229097545147 +leaf_count=158493 309238 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=6 +num_leaves=2 +num_cat=0 +split_feature=48 +split_gain=235592 +threshold=1.5000000000000002 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.095518925887709522 0.062735238584165104 +leaf_weight=38850.563534662127 59587.91718865931 +leaf_count=155633 312098 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=7 +num_leaves=2 +num_cat=0 +split_feature=60 +split_gain=216481 +threshold=3.5000000000000004 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.090021709146951179 0.062422098839859545 +leaf_weight=39308.151290565729 57143.905157417059 +leaf_count=158493 309238 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=8 +num_leaves=2 +num_cat=0 +split_feature=48 +split_gain=199450 +threshold=2.5000000000000004 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.086096234599393709 0.06149025426265723 +leaf_weight=39100.364814996719 55222.588810950518 +leaf_count=158726 309005 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=9 +num_leaves=2 +num_cat=0 +split_feature=60 +split_gain=184172 +threshold=3.5000000000000004 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.083008767702045239 0.060331798293113095 +leaf_weight=38546.52131408453 53527.362207323313 +leaf_count=158493 309238 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=10 +num_leaves=2 +num_cat=0 +split_feature=60 +split_gain=170303 +threshold=2.5000000000000004 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.08118643459991752 0.058525067147374148 +leaf_weight=37382.900906741619 52367.711843729019 +leaf_count=155701 312030 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=11 +num_leaves=2 +num_cat=0 +split_feature=48 +split_gain=158571 +threshold=1.5000000000000002 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.078789646538701255 0.057687193872347775 +leaf_weight=36708.809957891703 50650.651836320758 +leaf_count=155633 312098 +internal_value=0 +internal_weight=0 +internal_count=467731 
+is_linear=0 +shrinkage=0.05 + + +Tree=12 +num_leaves=2 +num_cat=0 +split_feature=48 +split_gain=147179 +threshold=2.5000000000000004 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.075068618436233622 0.057787518397969723 +leaf_weight=36724.755574867129 48212.871220514178 +leaf_count=158726 309005 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=13 +num_leaves=2 +num_cat=0 +split_feature=60 +split_gain=137101 +threshold=4.5000000000000009 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.072338606191567192 0.057519075243569077 +leaf_weight=36312.050563037395 46168.743419364095 +leaf_count=161328 306403 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +Tree=14 +num_leaves=2 +num_cat=0 +split_feature=48 +split_gain=127824 +threshold=1.5000000000000002 +decision_type=2 +left_child=-1 +right_child=-2 +leaf_value=-0.072383765528558658 0.055239975939967857 +leaf_weight=34423.690812692046 45620.882858976722 +leaf_count=155633 312098 +internal_value=0 +internal_weight=0 +internal_count=467731 +is_linear=0 +shrinkage=0.05 + + +end of trees + +feature_importances: +Column_48=8 +Column_60=7 + +parameters: +[boosting: gbdt] +[objective: binary] +[metric: binary_logloss] +[tree_learner: serial] +[device_type: cpu] +[data_sample_strategy: bagging] +[data: ] +[valid: ] +[num_iterations: 15] +[learning_rate: 0.05] +[num_leaves: 2] +[num_threads: 0] +[seed: 0] +[deterministic: 0] +[force_col_wise: 0] +[force_row_wise: 0] +[histogram_pool_size: -1] +[max_depth: -1] +[min_data_in_leaf: 20] +[min_sum_hessian_in_leaf: 0.001] +[bagging_fraction: 1] +[pos_bagging_fraction: 1] +[neg_bagging_fraction: 1] +[bagging_freq: 0] +[bagging_seed: 3] +[feature_fraction: 0.9] +[feature_fraction_bynode: 1] +[feature_fraction_seed: 2] +[extra_trees: 0] +[extra_seed: 6] +[early_stopping_round: 0] +[first_metric_only: 0] +[max_delta_step: 0] +[lambda_l1: 0] +[lambda_l2: 0] +[linear_lambda: 0] +[min_gain_to_split: 0] +[drop_rate: 0.1] +[max_drop: 50] +[skip_drop: 0.5] +[xgboost_dart_mode: 0] +[uniform_drop: 0] +[drop_seed: 4] +[top_rate: 0.2] +[other_rate: 0.1] +[min_data_per_group: 100] +[max_cat_threshold: 32] +[cat_l2: 10] +[cat_smooth: 10] +[max_cat_to_onehot: 4] +[top_k: 20] +[monotone_constraints: ] +[monotone_constraints_method: basic] +[monotone_penalty: 0] +[feature_contri: ] +[forcedsplits_filename: ] +[refit_decay_rate: 0.9] +[cegb_tradeoff: 1] +[cegb_penalty_split: 0] +[cegb_penalty_feature_lazy: ] +[cegb_penalty_feature_coupled: ] +[path_smooth: 0] +[interaction_constraints: ] +[verbosity: 1] +[saved_feature_importance_type: 0] +[use_quantized_grad: 0] +[num_grad_quant_bins: 4] +[quant_train_renew_leaf: 0] +[stochastic_rounding: 1] +[linear_tree: 0] +[max_bin: 255] +[max_bin_by_feature: ] +[min_data_in_bin: 3] +[bin_construct_sample_cnt: 200000] +[data_random_seed: 1] +[is_enable_sparse: 1] +[enable_bundle: 1] +[use_missing: 1] +[zero_as_missing: 0] +[feature_pre_filter: 1] +[pre_partition: 0] +[two_round: 0] +[header: 0] +[label_column: ] +[weight_column: ] +[group_column: ] +[ignore_column: ] +[categorical_feature: ] +[forcedbins_filename: ] +[precise_float_parser: 0] +[parser_config_file: ] +[objective_seed: 5] +[num_class: 1] +[is_unbalance: 0] +[scale_pos_weight: 1] +[sigmoid: 1] +[boost_from_average: 1] +[reg_sqrt: 0] +[alpha: 0.9] +[fair_c: 1] +[poisson_max_delta_step: 0.7] +[tweedie_variance_power: 1.5] +[lambdarank_truncation_level: 30] +[lambdarank_norm: 1] +[label_gain: ] 
+[lambdarank_position_bias_regularization: 0] +[eval_at: ] +[multi_error_top_k: 1] +[auc_mu_weights: ] +[num_machines: 1] +[local_listen_port: 12400] +[time_out: 120] +[machine_list_filename: ] +[machines: ] +[gpu_platform_id: -1] +[gpu_device_id: -1] +[gpu_use_dp: 0] +[num_gpu: 1] + +end of parameters + +pandas_categorical:null diff --git a/best_higgs_bf_3000 b/best_higgs_bf_3000 deleted file mode 100644 index 22ca332..0000000 Binary files a/best_higgs_bf_3000 and /dev/null differ diff --git a/best_higgs_bf_3000.pkl b/best_higgs_bf_3000.pkl deleted file mode 100644 index d64b346..0000000 Binary files a/best_higgs_bf_3000.pkl and /dev/null differ diff --git a/best_higgs_model_50kb.pth b/best_higgs_model_50kb.pth deleted file mode 100644 index 2257b84..0000000 Binary files a/best_higgs_model_50kb.pth and /dev/null differ diff --git a/best_tweet_model.pth b/best_tweet_model.pth deleted file mode 100644 index 7cb188e..0000000 Binary files a/best_tweet_model.pth and /dev/null differ diff --git a/best_url_model.pth b/best_url_model.pth new file mode 100644 index 0000000..f6777f7 Binary files /dev/null and b/best_url_model.pth differ diff --git a/main_url_bf.py b/bf_url_main.py similarity index 87% rename from main_url_bf.py rename to bf_url_main.py index 5ffd903..0e2798d 100644 --- a/main_url_bf.py +++ b/bf_url_main.py @@ -41,15 +41,8 @@ # 组合训练集和测试集的url_type为1的url数据 combined_data = np.concatenate((df_train.loc[id_train, 'url'].values, df_test.loc[id_test, 'url'].values), axis=0) -# 定义布隆过滤器初始大小 -initial_size = 32 -max_size = 512 - -# 循环,从32开始,每次乘以2,直到256 -size = initial_size -while size <= max_size: - bloom_size = size * 1024 - bloom_filter = lib.bf_util.create_bloom_filter(dataset=combined_data, bf_size=bloom_size) +for size in range(64 * 1024, 320 * 1024 + 1, 64 * 1024): + bloom_filter = lib.bf_util.create_bloom_filter(dataset=combined_data, bf_size=size) # 统计假阳性率 fp = 0 @@ -70,5 +63,4 @@ fn = fn + 1 print(f'error for url {url}') - print(f'fpr: {fp / total_neg}') - size *= 2 \ No newline at end of file + print(f'fpr: {fp / total_neg}') \ No newline at end of file diff --git a/bf_yelp_main.py b/bf_yelp_main.py new file mode 100644 index 0000000..4cb8165 --- /dev/null +++ b/bf_yelp_main.py @@ -0,0 +1,95 @@ +import numpy as np +import pandas as pd +from datetime import datetime +import math +import lib.bf_util + +# 加载数据集 +data_train = pd.read_csv('dataset/tweet/tweet_train.csv') +data_test = pd.read_csv('dataset/tweet/tweet_test.csv') +data_query = pd.read_csv('dataset/tweet/tweet_query.csv') + + + +def cal_region_id(lon, lat, x_min=27, x_max=54, y_min=-120, y_max=-74, one_kilo=0.009): + lon, lat = float(lon), float(lat) + lonTotal = math.ceil((y_max - y_min) / one_kilo) + if x_min <= lat <= x_max and y_min <= lon <= y_max: + x_num = math.ceil((lat - x_min) / one_kilo) + y_num = math.ceil((lon - y_min) / one_kilo) + square_num = x_num * lonTotal + y_num + return square_num + else: + return None + + +def insert(ck): + time = ck['timestamp'] + keywords = ck['keywords'] + lat = ck['lat'] + lon = ck['lon'] + time = datetime.strptime(time, '%Y-%m-%d %H:%M:%S') + time_bucket = time.hour * 2 + time.minute // 30 + time = str(time.year) + str(time.month).zfill(2) + str(time.day).zfill(2) + str(time_bucket).zfill(2) + region_id = str(cal_region_id(lat=lat, lon=lon)).zfill(8) + try: + keywords = keywords.replace(" ", "") + except AttributeError: + keywords = '' + ck['insert'] = time + region_id + keywords + return ck['insert'] + + +def yelp_insert(data_train): + data_train['keywords'] = 
data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + data_train = data_train[data_train['is_in'] == 1] + + data_train['insert'] = data_train.apply(insert, axis=1) + data = data_train[['insert', 'is_in']] + return data + + +def query_insert(data_train): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + data_train['insert'] = data_train.apply(insert, axis=1) + data = data_train[['insert', 'is_in']] + return data + + +insert_train = yelp_insert(data_train) +insert_test = yelp_insert(data_test) +insert_query = query_insert(data_query) +combined_data = np.concatenate([insert_train, insert_test], axis=0) +print(combined_data) + +for size in range(64 * 1024, 320 * 1024 + 1, 64 * 1024): + bloom_filter = lib.bf_util.create_bloom_filter(dataset=combined_data, bf_size=size) + + # 统计假阳性率 + fp = 0 + fn = 0 + total_neg = 0 + # 遍历df_query中的每一个url列来查询布隆过滤器 + for index, row in insert_query.iterrows(): + url = row['insert'] + true_label = row['is_in'] # 0为负例,1为正例 + + if true_label == 0: + total_neg += 1 + if url in bloom_filter: + fp = fp + 1 + else: + print('contain positive query') + if url not in bloom_filter: + fn = fn + 1 + print(f'error for url {url}') + + print(f'fpr: {fp / total_neg}') diff --git a/bst_model.pkl b/bst_model.pkl deleted file mode 100644 index 111fb8e..0000000 Binary files a/bst_model.pkl and /dev/null differ diff --git a/bst_model.txt b/bst_model.txt deleted file mode 100644 index fafcf9b..0000000 --- a/bst_model.txt +++ /dev/null @@ -1,336 +0,0 @@ -tree -version=v4 -num_class=1 -num_tree_per_iteration=1 -label_index=0 -max_feature_idx=9 -objective=binary sigmoid:1 -feature_names=Column_0 Column_1 Column_2 Column_3 Column_4 Column_5 Column_6 Column_7 Column_8 Column_9 -feature_infos=[-1.255795955657959:48.287014007568359] [-1.3897253274917603:64.4449462890625] [-0.47082158923149109:58.829273223876953] [-1.2269440889358521:21.345249176025391] [-0.019876999780535698:50.309398651123047] [-0.18856589496135712:5.3031864166259766] [-0.17101302742958069:5.8475074768066406] [-1.9647719860076904:3.1470139026641846] [-1.6814316511154175:1.7693653106689453] [-0.77622032165527344:1.2882939577102661] -tree_sizes=6832 6880 6901 6896 6880 6900 6907 6894 6911 6923 - -Tree=0 -num_leaves=63 -num_cat=0 -split_feature=9 3 0 0 1 7 8 8 7 3 0 7 1 2 2 8 7 0 2 3 0 2 3 2 3 7 7 2 8 1 3 1 3 7 7 7 1 8 8 8 8 2 0 8 8 8 8 3 3 1 0 3 2 0 3 0 8 8 8 3 8 0 -split_gain=124394 19623.4 10951.7 8051.69 2504.65 2243.99 2077.03 3541.67 2013.45 1217.79 1081.17 1076.16 1118.1 1345.04 1318.01 961.082 914.213 649.842 619.176 650.678 1594.72 703.221 602.358 505.292 439.228 400.853 395.307 385.799 375.169 308.236 284.619 270.269 264.084 254.625 247.014 246.376 242.893 269.247 339.815 232.928 561.532 231.528 224.613 214.777 411.813 221.563 215.428 211.708 276.725 257.977 198.898 329.891 266.428 205.091 198.176 325.708 195.816 237.361 180.134 174.843 174.307 217.558 -threshold=1.0000000180025095e-35 -1.0289423763751981 -0.33242557942867274 0.41995026171207434 0.47061920166015631 -0.29721513390541071 -0.96772852540016163 -0.96778246760368336 -0.3375259786844253 5.4391126632690439 
0.077961251139640822 0.1215791590511799 1.8850960135459902 1.0000000180025095e-35 -0.42596523463726038 -1.6652725934982298 -0.36696811020374293 0.9215342402458192 -0.15682714059948918 -0.89694124460220326 -0.60601681470870961 -0.42596523463726038 0.8190734088420869 -0.24653984606266019 0.42307002842426306 2.3508639335632329 -1.921579837799072 0.74029985070228588 -1.6671294569969175 -0.574863702058792 0.8190734088420869 -0.29811820387840265 -0.89694124460220326 0.084623351693153395 -0.38312204182147974 -0.36696811020374293 -0.79011020064353932 0.62498643994331371 0.32180465757846838 1.1510444879531863 1.1580279469490053 -0.15682714059948918 -0.87960800528526295 1.5684867501258852 1.0512721538543703 -0.77818816900253285 -1.569857597351074 -0.89694124460220326 -0.76494011282920826 -1.0361061692237852 -0.4692212045192718 -0.89694124460220326 -0.42596523463726038 -0.65161538124084462 -0.89694124460220326 -0.74281239509582508 -0.061670070514082902 -0.24806005507707593 0.61703985929489147 1.3470779061317446 -0.15848610550165174 1.7195086479187014 -decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 -left_child=1 -1 5 -2 24 6 7 15 32 11 25 12 13 14 30 28 -15 43 19 21 36 -10 -19 47 -5 41 -16 35 -3 58 -6 -26 -8 59 -28 -17 -21 38 -38 -13 -41 -4 -23 44 45 46 -18 49 -49 -7 51 -37 53 -53 55 -20 57 -31 -22 -24 61 -61 -right_child=3 2 10 4 9 23 8 -9 18 -11 -12 39 -14 16 26 27 17 22 54 20 29 42 33 -25 31 -27 34 -29 -30 56 -32 -33 -34 -35 -36 50 37 -39 -40 40 -42 -43 -44 -45 -46 -47 -48 48 -50 -51 -52 52 -54 -55 -56 -57 -58 -59 -60 60 -62 -63 -leaf_value=0.19805928014770968 0.19901117155329887 -0.085076952616401261 -0.12944502605285924 0.138009076864544 0.04663318665942786 -0.025806086315919689 0.13764877760694325 0.19924365571537453 -0.14206573169945311 -0.13310934489044368 -0.18299360380332019 0.17346739334364203 0.16636924691480301 0.14150692483697 -0.17037666354244596 0.047123245477189768 -0.080394403337580758 0.17752566155636781 -0.15619682235010779 -0.010627340971723158 -0.088928438824564152 -0.12964529198344107 -0.025789322600074025 -0.081862927805255448 0.1265163699834497 -0.030318979602100095 0.19055482924674758 0.17056981546264433 0.17439395187310364 -0.092672687004467749 -0.093022092633165951 -0.097567259373077753 0.020014670086876078 -0.19313736682877225 0.12992031844910112 -0.11877410026567586 -0.048946368392233104 -0.01971620253667293 -0.12774548231802227 -0.14875640651188576 0.13941831750685341 -0.15906830282183188 0.066055383803699366 -0.16587268864228294 0.1514560365508133 -0.077699915400679401 0.10047063191841221 -0.04645175245679866 0.043203885787840506 0.14160172401081403 -0.10914534127825569 0.012302805394164038 -0.09721941980439297 -0.055966839847249338 -0.14502361303573838 0.12109794637759437 -0.087261058925342833 -0.15575106016904441 -0.17795226552411064 0.13961817688871409 0.014887187106304696 -0.03172751419441297 -leaf_weight=2195.2496074587107 18065.746769592166 71.749987170100212 6247.2488829046488 101.24998189508915 311.24994434416294 125.74997751414776 272.74995122849941 645.49988457560539 3165.7494339197874 203.9999635219574 10837.24806214869 2777.7495032995939 1779.4996818006039 649.49988386034966 39.499992936849594 182.7499673217535 110.49998024106026 308.49994483590126 96.499982744455338 1879.2496639639139 302.24994595348835 78.749985918402672 379.49993214011192 424.74992404878139 57.749989673495293 331.49994072318077 955.24982918798923 69.749987527728081 249.49995538592339 
1218.499782115221 274.74995087087154 791.7498584240675 635.49988636374474 52.499990612268448 2264.7495950311422 706.24987371265888 1726.4996912777424 1437.2497429996729 801.24985672533512 74.999986588954926 686.99987715482712 4567.2491833120584 229.74995891749859 128.99997693300247 112.99997979402542 752.49986544251442 162.9999708533287 452.99991899728775 1434.2497435361147 343.49993857741356 1048.9998124241829 899.74983911216259 621.99988877773285 861.24984599649906 2836.7494927495718 75.499986499547958 3940.2492954283953 1168.7497910112143 916.49983611702919 333.74994032084942 504.99990969896317 95.249982967972755 -leaf_count=8781 72263 287 24989 405 1245 503 1091 2582 12663 816 43349 11111 7118 2598 158 731 442 1234 386 7517 1209 315 1518 1699 231 1326 3821 279 998 4874 1099 3167 2542 210 9059 2825 6906 5749 3205 300 2748 18269 919 516 452 3010 652 1812 5737 1374 4196 3599 2488 3445 11347 302 15761 4675 3666 1335 2020 381 -internal_value=-0.000840423 -0.0946772 -0.107302 0.154978 0.0977952 -0.0668617 -0.0759479 -0.0204297 -0.0902604 0.109287 -0.160503 0.113164 0.0951642 0.0781277 0.119228 -0.0505229 0.0341267 0.0104246 -0.0969146 -0.0894592 -0.079454 -0.128021 0.0513189 0.0185335 -0.0588684 -0.138635 0.144051 -0.062739 0.116442 -0.109824 -0.0188451 -0.0823338 0.0553405 0.0228161 0.147908 -0.0665051 -0.0402396 -0.0542747 -0.0739242 0.160032 0.111055 -0.141956 0.0160994 -0.0435797 -0.0297292 -0.0496843 0.0273972 0.0366296 0.0216837 0.0967396 -0.071523 -0.0587478 -0.0409578 -0.0210857 -0.138704 -0.0344773 -0.100954 -0.123555 -0.155874 0.0314477 0.054704 0.101575 -internal_weight=0 53099 50903.7 31977.2 13911.5 28920.5 26139.2 5357.5 20781.7 12960.7 21983.2 12756.7 9217 7437.5 3845.5 4712 3592 2942.5 19873.5 16864.7 13390.5 3474.25 1674.5 2781.25 950.75 11146 3259.5 4390.75 321.25 7546.25 586 849.5 908.25 1366 3220 4321 5844.25 3965 2527.75 3539.75 762 10814.5 308.5 1268 1139 1026 273.5 2356.5 1887.25 469.25 4138.25 3089.25 2383 1761 3008.75 172 6327.5 2387.25 1218.75 1313.5 934 429 -internal_count=340305 212396 203615 127909 55646 115682 104557 21430 83127 51843 87933 51027 36868 29750 15382 18848 14368 11770 79494 67459 53562 13897 6698 11125 3803 44584 13038 17563 1285 30185 2344 3398 3633 5464 12880 17284 23377 15860 10111 14159 3048 43258 1234 5072 4556 4104 1094 9426 7549 1877 16553 12357 9532 7044 12035 688 25310 9549 4875 5254 3736 1716 -is_linear=0 -shrinkage=1 - - -Tree=1 -num_leaves=63 -num_cat=0 -split_feature=9 3 0 0 1 7 8 7 8 8 3 0 7 7 1 2 8 2 2 3 0 2 7 2 0 3 3 2 7 2 8 8 8 8 1 3 8 7 8 1 7 7 3 0 8 0 7 3 2 3 2 2 8 8 8 8 8 8 8 8 8 8 -split_gain=101488 16005.7 8997.7 6672.45 2041.47 1825.36 1688.73 1511.67 1177.7 1779.78 993.418 904.656 886.32 925.424 1077.38 1031.52 781.635 754.597 487.968 498.703 1212.53 547.438 484.808 409.928 401.891 448.455 357.059 339.244 328.886 313.997 306.559 264.967 615.305 301.337 219.566 219.517 217.57 213.158 212.304 264.182 210.784 681.213 210.674 207.413 206.504 204.763 209.079 283.999 202.496 193.592 356.652 192.628 191.657 457.653 188.749 173.693 173.305 165.385 459.197 286.953 163.902 232.708 -threshold=1.0000000180025095e-35 -1.0289423763751981 -0.33242557942867274 0.41995026171207434 0.47061920166015631 -0.29721513390541071 -0.84467342495918263 -0.3375259786844253 -0.96778246760368336 -0.96772852540016163 5.5711138248443612 0.077961251139640822 0.1215791590511799 -0.38312204182147974 1.7005990743637087 1.0000000180025095e-35 -1.6652725934982298 -0.42596523463726038 -0.15682714059948918 -0.89694124460220326 -0.60601681470870961 
-0.42596523463726038 0.084623351693153395 -0.24653984606266019 0.8987349569797517 0.8190734088420869 0.42307002842426306 -0.33625254034996027 2.3508639335632329 0.74029985070228588 -1.6671294569969175 0.93881765007972728 0.98129758238792431 0.74979063868522655 -0.29811820387840265 0.8190734088420869 1.0396918058395388 2.9366999864578252 1.0342026352882387 -0.79011020064353932 2.0183315277099614 2.2919597625732426 -0.89694124460220326 -0.74281239509582508 -0.77818816900253285 -0.49202047288417811 -0.36696811020374293 -0.89694124460220326 -0.42596523463726038 2.0070835351943974 -0.24653984606266019 -0.15682714059948918 1.1510444879531863 1.1580279469490053 -1.0620006322860716 -1.1708780527114866 0.75507655739784252 -1.3048880100250242 -1.275070786476135 -0.41330277919769282 1.5570636391639712 1.6632806658744814 -decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 -left_child=1 -1 5 -2 26 6 8 42 16 -10 12 28 13 27 15 17 30 35 19 21 36 -9 24 37 44 -26 -5 -6 51 45 -3 33 -33 -22 -28 -15 38 40 39 -21 -7 -42 -8 -23 -17 46 -18 -48 54 -19 -51 -4 -14 -54 -49 -12 -35 -27 -59 -60 -29 -62 -right_child=3 2 11 4 10 23 7 18 9 -11 55 -13 52 14 -16 22 29 49 -20 20 31 43 -24 -25 25 57 34 60 -30 -31 -32 32 -34 56 -36 -37 -38 -39 -40 -41 41 -43 -44 -45 -46 -47 47 48 -50 50 -52 -53 53 -55 -56 -57 -58 58 59 -61 61 -63 -leaf_value=0.18083608790820491 0.18171927635846019 -0.075955263610795881 -0.11624776919909861 0.12558247221988914 0.02072882850883169 0.058922470745686122 0.12452738797459083 -0.12796403958479116 0.18193501657796873 -0.050363708996489912 0.17517608154111994 -0.1653654220146219 0.15807766994533878 0.039848161105754648 0.140304120944857 0.054747439381765475 0.070285533115284335 0.12527852458153518 -0.12554664736420049 -0.0063685356539725417 -0.097972946463430188 -0.049025172895943374 -0.1825406144215653 -0.073047055545639214 0.15698367003308727 0.1243586853948101 0.11509728680610354 0.16273278109479505 -0.026537025869847859 0.15543552426209212 0.15895747839520835 0.11768294869693402 -0.084509686781009619 -0.18305729899751899 -0.087269228415489286 -0.09886585750669076 -0.0030986887351951171 0.12504147653664774 -0.18146438122226649 -0.061417957988141397 -0.10873444426002715 0.051137683516046709 0.016014442677508043 0.12440805126274897 -0.052145853587451434 -0.09359327571650089 -0.10669698635914833 0.0023779629960791909 -0.085344194377350385 0.1796521037303121 -0.14715231979184962 -0.1433414396443895 -0.13388969839753012 0.12696521034520822 -0.077612685292688555 -0.15116230008885062 -0.10162769826765872 -0.18666556865731632 0.12128076650538006 -0.0027678869944233312 -0.16346149176810348 0.18169722318711726 -leaf_weight=2173.8614719212055 17888.048495456576 71.620323926210403 6221.1531631350517 100.7694111764431 202.10022965073586 948.04581210017204 255.53765732049942 3044.6121025085449 639.13588362932205 681.49052357673645 17.920503616333008 10747.028164669871 2757.4559127688408 249.36440294981003 1634.0628312826157 225.99680590629578 148.66745576262474 2017.9725514054298 2874.6481968462467 1251.4646418690681 4080.3014157414436 189.56115382909775 131.1437936425209 424.03917254507542 302.69238193333149 137.04243148863316 57.519522741436958 1876.7626656293869 331.42382928729057 69.24512243270874 247.61254200339317 163.40432684123516 1907.0265004336834 774.13954283297062 789.8687390089035 210.29475219547749 1406.8961688280106 230.7647725045681 116.7435836493969 2873.2286886423826 408.62795673310757 766.41098581254482 
596.69024395942688 108.38172960281372 902.27368168532848 1235.9042687863111 702.76856251060963 1274.5480503439903 567.40819849073887 61.241209924221039 73.439662307500839 4538.4805338829756 74.586617946624756 684.41909855604172 383.82365393638611 181.4450991153717 394.58229500055313 109.18956805765629 252.58212339878082 712.51802957057953 37.779791474342346 40.44393789768219 -leaf_count=8781 72263 287 24989 405 812 3797 1027 12240 2582 2733 72 43349 11113 998 6577 906 595 8106 11559 5006 16374 760 527 1699 1220 550 231 7562 1326 279 998 655 7646 3119 3167 843 5628 924 467 11510 1636 3069 2387 434 3619 4957 2821 5100 2275 246 295 18269 300 2751 1536 729 1582 437 1014 2851 152 163 -internal_value=0 -0.0848694 -0.0962675 0.141381 0.0892479 -0.0595987 -0.067813 -0.0817689 -0.0213789 0.0620606 0.0996629 -0.144689 0.103142 0.0867844 0.0691811 0.0476021 -0.0448161 0.0926955 -0.087558 -0.0808626 -0.071985 -0.115304 0.00512839 0.0174601 0.0144429 0.0481098 -0.0523711 0.144071 -0.124654 -0.0558208 0.106254 -0.0988467 -0.0685521 -0.110796 -0.0735329 -0.0236137 -0.037176 0.0337647 -0.0484795 -0.0447155 0.0238435 -0.00445896 0.0485517 0.0140641 -0.0307347 -0.0592124 -0.045404 -0.051277 -0.0337786 0.117531 0.00145006 -0.127676 0.145829 0.101331 -0.0161356 -0.121829 -0.155565 0.020904 0.00770672 0.0296976 0.156821 0.0149956 -internal_weight=0 52849.3 50675.4 31728.9 13840.9 28837.3 26059.4 20037.2 6022.22 1320.63 12892.7 21838.1 12693.4 9176.9 7019.81 5385.75 4701.6 2612.31 19185 16310.3 12967.8 3342.55 2773.44 2777.89 2642.3 1514.02 948.158 2157.09 11091.1 4382.37 319.233 7319.45 2070.43 5249.02 847.388 459.659 5648.33 2353.85 4241.44 4124.69 2123.08 1175.04 852.228 297.943 1128.27 4313.12 3077.22 2928.55 2225.78 2152.65 134.681 10759.6 3516.46 759.006 1658.37 199.366 1168.72 1211.33 1074.29 965.1 1954.99 78.2237 -internal_count=340305 212396 203615 127909 55646 115682 104557 80394 24163 5315 51843 87933 51042 36878 28189 21612 18848 10488 76980 65421 51987 13434 11124 11125 10597 6072 3803 8689 44584 17563 1285 29376 8301 21075 3398 1841 22611 9426 16983 16516 8502 4705 3414 1194 4525 17284 12327 11732 8911 8647 541 43258 14164 3051 6636 801 4701 4852 4302 3865 7877 315 -is_linear=0 -shrinkage=0.1 - - -Tree=2 -num_leaves=63 -num_cat=0 -split_feature=9 3 0 0 1 7 8 7 8 8 3 0 7 1 2 2 8 7 0 2 3 0 2 3 2 7 3 7 2 8 1 7 7 3 7 2 1 8 8 8 8 3 7 3 3 1 7 3 0 0 7 8 8 8 8 3 0 8 8 8 3 3 -split_gain=83804.5 13209.2 7571.5 5749.73 1680.69 1494.7 1381.62 1236.52 966.93 1338.93 817.69 806.787 747.566 790.298 927.108 903.421 647.923 629.234 517.97 409.419 419.371 1004.31 453.035 403.621 333.275 310.943 292.037 262.627 260.348 254.236 221.639 219.172 235.197 195.964 192.726 181.553 179.263 177.051 179.247 271.437 216.529 174.27 174.186 204.337 241.031 198.011 172.288 191.311 292.628 171.248 170.951 169.441 353.857 178.661 167.628 166.799 264.107 164.661 187.509 164.533 180.135 184.939 -threshold=1.0000000180025095e-35 -1.0289423763751981 -0.33242557942867274 0.41995026171207434 0.47061920166015631 -0.29721513390541071 -0.84467342495918263 -0.3375259786844253 -0.98042774200439442 -0.96772852540016163 5.4391126632690439 -0.013235819933470337 0.1215791590511799 1.8850960135459902 1.0000000180025095e-35 -0.42596523463726038 -1.6652725934982298 -1.3274887800216673 0.9215342402458192 -0.15682714059948918 -0.89694124460220326 -0.60601681470870961 -0.42596523463726038 0.8190734088420869 -0.24653984606266019 -1.921579837799072 0.42307002842426306 2.3508639335632329 0.74029985070228588 -1.6671294569969175 -0.574863702058792 
-1.1447212696075437 -0.36696811020374293 0.8190734088420869 -0.38312204182147974 -0.24653984606266019 -0.29811820387840265 1.0396918058395388 0.20498233288526538 0.44872735440731054 1.0342026352882387 -0.89694124460220326 2.9366999864578252 -0.89694124460220326 -0.76494011282920826 -1.0361061692237852 -0.36696811020374293 -0.89694124460220326 -0.65161538124084462 -0.87960800528526295 0.084623351693153395 1.5570636391639712 1.0512721538543703 -0.65551894903182972 0.38476034998893743 -0.89694124460220326 -0.74281239509582508 -0.061670070514082902 -0.24806005507707593 0.50243258476257335 2.0070835351943974 2.1390846967697148 -decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 -left_child=1 -1 5 -2 26 6 8 41 16 -10 12 27 13 14 15 33 29 -16 51 20 22 37 -9 -20 42 -17 -5 -4 35 -3 -23 -25 -33 -6 -27 46 -28 38 -22 54 -41 -8 43 45 -45 -7 -18 -48 -49 -24 -34 52 53 -19 -40 56 -21 58 -32 60 -36 -62 -right_child=3 2 11 4 10 24 7 19 9 -11 -12 -13 -14 -15 17 25 28 18 23 55 21 30 49 31 -26 34 36 -29 -30 -31 57 32 50 -35 59 -37 -38 -39 39 40 -42 -43 -44 44 -46 -47 47 48 -50 -51 -52 -53 -54 -55 -56 -57 -58 -59 -60 -61 61 -63 -leaf_value=0.16723433413851799 0.16809519524641148 -0.068695468219386643 -0.11322872408089223 0.11452433542780267 0.039108971798844677 -0.028773294212034402 0.11354310429990609 -0.11676610937811754 0.15205536455970867 -0.045452758729842273 -0.10943303554728206 -0.14945764583773044 0.13380647198758647 0.13976478610192336 0.16219219459596487 -0.15927174757749421 0.057266001180909604 0.019377987465908374 0.14931893177635525 -0.13035302469759361 -0.022712382050360684 -0.12947963363763962 -0.11891025588278414 -0.22116437176545256 -0.066041058479979639 0.16192926461359339 0.10474210045098636 -0.013180554270390298 0.1427376706959714 0.14610537283328745 -0.072949834034148883 0.14214416332467536 0.025945128449897971 -0.077046279729879749 0.09837392688649127 -0.10269321311606031 -0.079051017567900006 -0.0027889072918616405 -0.060108806108376635 -0.021458593561320235 -0.1649387930911814 0.014416398830558612 0.11323287494388215 -0.061792829779412806 0.026938562378522907 0.12450614208454475 -0.098476730233181944 0.0042275575085326134 -0.063570402144303928 0.056532709075636511 -0.16118199858713744 -0.13928924582023586 0.14376855994536364 -0.066971961868710531 -0.14744091266792378 -0.12057607059461023 0.12177880102239956 -0.069826220492134147 -0.13185703652269173 0.14451650939096267 -0.12320447874808221 0.13149919063559187 -leaf_weight=2118.3090286552906 17426.564277112484 71.286857724189758 8967.776243403554 99.511439725756645 310.69709046185017 115.35888746380806 252.38804551959038 3004.8900675773621 693.35797129571438 679.70128865540028 200.35662764310837 12226.488481655717 3451.270025998354 1737.3315938264132 318.49606114625931 39.34087198972702 155.523048132658 356.38085828721523 328.42495773732662 93.139401078224182 2070.6048350334167 1178.0475510507822 74.156596809625626 34.690302416682243 422.2119634449482 927.07326392829418 56.915312066674232 270.28151646256447 67.92908987402916 242.69507437944412 1019.2109444737434 174.95609797537327 1306.1909636706114 272.74817796051502 1321.5508254766464 661.9484960436821 785.02589827775955 1406.8565304279327 413.43188782036304 1162.8420498669147 115.63967365026474 596.55639754235744 229.40589046478271 401.42628759145737 1289.7013752311468 312.82033811509609 665.02388845384121 1005.7785769253969 1734.5071271359921 222.76356843113899 50.715662509202957 134.23258906602859 
117.81982812285423 731.285221606493 469.24062830209732 2670.574012607336 74.999628961086273 3909.0908323675394 1150.1401755958796 785.15398117899895 76.733392328023911 45.358642026782036 -leaf_count=8781 72263 287 36529 405 1245 462 1027 12240 2867 2733 816 50322 14159 7118 1303 158 624 1435 1350 380 8300 4826 299 142 1699 3821 231 1082 279 998 4114 715 5254 1099 5373 2672 3167 5628 1660 4665 467 2387 924 1608 5170 1262 2694 4028 6981 895 210 543 473 2943 1891 10879 300 15761 4675 3193 308 185 -internal_value=0 -0.077362 -0.0877044 0.130017 0.0814472 -0.0540449 -0.0615235 -0.0741963 -0.0193684 0.0542835 0.0909888 -0.132605 0.0941961 0.0791223 0.0647527 0.0992352 -0.0413305 0.0280821 0.0148773 -0.0794625 -0.0733228 -0.0651806 -0.105122 0.0485239 0.0157562 0.120129 -0.047479 -0.110302 -0.0514777 0.0973367 -0.0897811 0.0273924 0.0330211 -0.0151911 0.123612 -0.054602 -0.0666266 -0.0335214 -0.0437385 -0.0638837 -0.0344365 0.0438864 0.0304603 0.0215005 0.00587622 0.08321 -0.0456619 -0.0503627 -0.0386862 0.0127154 0.0189511 -0.0327144 -0.0208471 -0.0386788 -0.106536 -0.114494 -0.0178878 -0.0820872 -0.104181 0.107674 0.0876372 -0.028579 -internal_weight=0 52216.1 50097.8 31088.8 13662.3 28633.2 25862.3 19884.6 5977.75 1373.06 12720.8 21464.5 12520.5 9069.18 7331.85 3778.66 4604.69 3553.19 3234.7 19035.6 16196.9 12895.1 3301.81 1894.98 2770.92 3195.21 941.453 9238.06 4290.71 313.982 7256.49 1566.55 1531.86 583.445 3155.87 4222.78 841.941 5638.62 4231.76 2161.15 1278.48 848.944 2348.71 2119.31 1691.13 428.179 3560.83 3405.31 2740.29 296.92 1356.91 1339.72 1205.49 1087.67 882.673 2838.71 168.139 6078.44 2169.35 2228.8 1443.64 122.092 -internal_count=340305 212396 203615 127909 55646 115682 104557 80394 24163 5600 51843 87933 51027 36868 29750 15382 18563 14368 13065 76980 65421 51987 13434 7671 11125 13038 3803 37611 17278 1285 29376 6321 6179 2344 12880 16999 3398 22611 16983 8683 5132 3414 9426 8502 6778 1724 14327 13703 11009 1194 5464 5394 4851 4378 3551 11559 680 24550 8789 9059 5866 493 -is_linear=0 -shrinkage=0.1 - - -Tree=3 -num_leaves=63 -num_cat=0 -split_feature=9 3 0 0 1 7 8 8 7 7 7 8 0 1 2 2 3 2 3 0 2 8 3 3 7 8 8 2 2 8 2 7 8 3 8 8 8 0 0 0 0 5 7 7 8 2 1 3 8 8 7 2 1 3 3 7 7 3 7 0 3 0 -split_gain=69812.6 10997.9 6486.66 5103.7 1397.19 1352.17 1153.97 2096.34 1194.68 661.555 703.421 583.974 583.823 557.434 676.366 577.533 459.3 456.008 354.708 998.608 410.168 348.204 308.175 272.462 267.154 263.974 316.587 258.458 251.572 248.958 247.878 227.977 225.311 221.627 207.176 495.517 237.992 206.332 196.651 196.003 194.981 194.583 181.321 180.344 172.607 169.748 192.51 190.292 167.23 376.982 166.974 166.842 215.403 165.73 161.971 157.988 171.315 157.788 275.44 157.311 156.762 229.223 -threshold=1.0000000180025095e-35 -1.0289423763751981 -0.28682704269886011 0.41995026171207434 0.56286767125129711 -0.29721513390541071 -0.96772852540016163 -0.96778246760368336 -0.3375259786844253 0.1215791590511799 -0.38312204182147974 -1.6652725934982298 0.12355979532003404 1.8850960135459902 1.0000000180025095e-35 -0.42596523463726038 4.779107093811036 -0.15682714059948918 -0.89694124460220326 -0.60601681470870961 -0.42596523463726038 -0.42785568535327906 0.55507114529609691 -0.89694124460220326 0.084623351693153395 1.5684867501258852 1.168141305446625 0.56087446212768566 -0.33625254034996027 0.70141646265983593 -0.24653984606266019 2.3508639335632329 -1.6671294569969175 2.0070835351943974 0.93881765007972728 0.98129758238792431 0.74979063868522655 0.69354149699211132 -0.99360433220863331 0.85313642024993908 
-0.56041827797889698 1.0000000180025095e-35 2.9366999864578252 2.0636492967605595 -1.2859565019607542 1.0000000180025095e-35 -0.23661921173334119 0.29106890410184866 1.1510444879531863 1.1580279469490053 -1.3274887800216673 -0.24653984606266019 -0.72861120104789723 -0.76494011282920826 0.8190734088420869 -1.8801200985908506 -1.9507447481155393 -0.89694124460220326 -0.3375259786844253 -0.74281239509582508 -0.89694124460220326 -0.74281239509582508 -decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 -left_child=1 -1 5 -2 21 6 7 11 55 10 28 32 31 14 15 54 -15 18 20 51 -10 33 37 38 25 26 29 40 -6 39 42 -4 -3 -5 36 -36 -21 -16 -7 -24 57 -31 53 -32 -42 -23 -47 -48 -11 -50 -49 52 -20 -25 -12 56 -8 58 -13 -22 61 -19 -right_child=3 2 12 4 9 23 8 -9 17 48 13 27 -14 16 22 -17 -18 60 19 34 59 45 24 30 -26 -27 -28 -29 -30 41 43 -33 -34 -35 35 -37 -38 -39 -40 -41 44 -43 -44 -45 -46 46 47 50 49 -51 -52 -53 -54 -55 -56 -57 -58 -59 -60 -61 -62 -63 -leaf_value=0.15664380274910772 0.15751234303479497 -0.061688899279464797 -0.11120207782841147 0.080412892220072515 0.016942711920389684 -0.026468026764681202 0.15790054049511298 0.15867044919168416 -0.10736294612851849 0.13626626982467335 0.032184656921911262 0.14101809229248097 -0.14328926184070509 0.12783024872429027 -0.043904656092550343 0.099618742402896476 -0.1054943602406898 -0.11995037249556556 -0.0048547576196411799 -0.082632299124944755 -0.039754131264035819 0.10509255225686878 -0.039088514096494893 -0.051290409984773515 -0.16294192925526046 -0.11014608578780474 0.14495148382098108 0.11260891477495755 0.13518914904763193 -0.094689418159931815 0.050432374866821332 -0.02291606605736575 0.13694405765160172 -0.088825696776953178 0.1069724098482786 -0.07043514699953754 -0.13305444104009914 0.14787376208694294 0.11851056920945524 0.040490488533566023 -0.1032020192647275 0.14486244780077981 0.10541344526048486 -0.10298152758266675 -0.043904410490651614 0.11317768993113933 0.12714759919103574 0.20592811210805292 -0.13277343758456117 0.10609093196067583 -0.095919554852406691 -0.11100882558381392 -0.044167799183237925 0.021423788322839029 -0.08538427985096346 0.059643767889692369 -0.11605172932625339 -0.017721086099552138 -0.090072309948525886 0.10954047202579763 -0.11265738188920026 0.11364530076981932 -leaf_weight=2039.3684519827366 16770.845326930285 74.185600027441978 9516.6851920932531 414.20368285477161 199.79536873102188 125.670992359519 29.063650235533714 601.70490670204163 3055.7576635032892 2557.700292751193 250.74024412035942 55.815912067890167 9470.5318138003349 1288.7139418125153 74.292767599225044 2133.1555314511061 90.277735814452171 92.554598182439804 2718.2338628172874 4692.1781844496727 195.91224794089794 62.764342859387398 430.0567165017128 406.73953475058079 103.86060935258865 195.21632482111454 163.71087929606438 98.075860306620598 1808.9075336903334 350.24827030301094 99.025107458233833 301.76146684587002 248.03610204160213 95.155754044651985 170.41654498875141 2067.5610176622868 1169.391344204545 229.10179027915001 366.15377652645111 1104.1097676157951 1037.9191492348909 37.542812541127205 199.00102593004704 338.74244569242001 931.40614216029644 56.661865517497063 43.112418204545975 18.777655363082886 73.543295323848724 650.38880287110806 762.38494223356247 235.10816697776318 2860.3030766844749 1366.5643810778856 219.98634506762028 789.11403004825115 106.37220320105553 1840.9877350330353 679.28778313100338 110.32195620238781 3151.0979270190001 
76.918935880064964 -leaf_count=8781 72263 301 39613 1699 812 504 119 2582 12676 10793 1007 225 40278 5416 301 8824 375 386 10913 19217 791 263 1734 1633 434 797 662 411 7649 1416 403 1216 1049 388 683 8418 4882 959 1499 4471 4229 151 809 1373 3794 233 178 77 296 2707 3109 946 11518 5490 895 3192 428 7402 2788 448 13090 309 -internal_value=0 -0.0709186 -0.0803362 0.120737 0.0747607 -0.051239 -0.058189 -0.0176094 -0.0686872 0.0860092 0.0715486 -0.0389695 -0.125575 0.0559273 0.0411713 0.0774954 0.112555 -0.0736792 -0.0671556 -0.0601117 -0.0963056 -0.0178349 0.00598567 0.0136356 -0.00609123 0.00105101 0.0114589 -0.0480031 0.123428 8.81417e-05 -0.000206471 -0.108489 0.0912125 0.0487967 -0.0828092 -0.056926 -0.0926916 0.100913 0.0814657 0.0181829 -0.0514686 -0.0714979 0.0149026 -0.0682786 -0.0751568 -0.053799 -0.0651196 -0.077376 0.124256 0.081825 -0.0886637 -0.0284897 -0.0250119 0.00474546 -0.0227593 0.0425182 -0.0572634 -0.0333599 -0.0725258 0.0140298 -0.107619 -0.0139283 -internal_weight=0 51318 49278.6 30185.3 13414.4 29989.6 27087.7 5567.42 21520.3 11961.3 8679.72 4965.71 19289 6671.01 5292.02 2603.88 1378.99 20595.8 17275.2 13913.2 3361.99 1453.06 2688.14 2901.9 2384.75 2280.88 2085.67 4643.49 2008.7 1921.96 2410.07 9818.45 322.222 509.359 8099.55 2237.98 5861.57 303.395 491.825 1534.17 4545.42 387.791 1972.3 437.768 1969.33 943.701 880.937 824.275 3281.63 723.932 781.163 5813.65 5578.54 1773.3 470.727 924.55 135.436 2576.09 735.104 306.234 3320.57 169.474 -internal_count=340305 212396 203615 127909 55646 122508 110797 22781 88016 49699 35903 20199 81107 27442 21651 10726 5791 84277 70492 56577 13915 5947 10925 11711 9665 9231 8434 18849 8461 7772 9708 40829 1350 2087 33200 9101 24099 1260 2003 6205 18438 1567 7932 1776 8023 3860 3597 3364 13796 3003 3186 23377 22431 7123 1902 3739 547 10415 3013 1239 13785 695 -is_linear=0 -shrinkage=0.1 - - -Tree=4 -num_leaves=63 -num_cat=0 -split_feature=9 3 0 0 1 7 8 7 8 8 7 7 0 1 2 8 2 3 2 7 8 8 8 3 0 2 7 7 7 2 3 8 2 7 8 8 8 2 7 8 8 7 7 7 2 8 2 0 3 3 8 8 3 2 8 8 3 3 2 8 8 8 -split_gain=58535 9219.84 5632.4 4635.33 1181.99 1116.54 958.259 883.612 658.372 1144.29 602.477 665.087 550.179 522.412 653.715 481.838 418.718 405.645 375.225 298.882 318.927 307.522 285.61 280.49 763.01 340.542 237.063 1056.86 227.033 211.807 210.777 190.008 188.628 186.107 180.852 404.506 206.94 180.544 173.562 234.752 339.916 167.392 167.037 572.089 288.832 161.484 160.898 178.684 153.984 210.903 148.846 147.311 145.946 281.538 144.782 154.392 140.879 139.741 139.026 136.784 135.435 166.367 -threshold=1.0000000180025095e-35 -1.0289423763751981 -0.28682704269886011 0.41995026171207434 0.47061920166015631 -0.29721513390541071 -0.84467342495918263 -0.3375259786844253 -0.96778246760368336 -0.96772852540016163 0.1215791590511799 -0.38312204182147974 0.10076052322983743 1.6391000747680666 1.0000000180025095e-35 -1.6652725934982298 -0.42596523463726038 4.779107093811036 -0.15682714059948918 0.084623351693153395 1.5684867501258852 1.1580279469490053 -0.15848610550165174 -0.89694124460220326 -0.60601681470870961 -0.24653984606266019 2.0183315277099614 2.2919597625732426 -1.921579837799072 0.56087446212768566 0.42307002842426306 -1.6671294569969175 -0.067114436998963342 2.3508639335632329 0.93881765007972728 0.98129758238792431 0.74979063868522655 -0.33625254034996027 2.8198474645614628 1.1510444879531863 1.1580279469490053 2.7036424875259404 -1.3274887800216673 -1.15643835067749 -0.33625254034996027 0.75507655739784252 0.11231096461415292 0.83033716678619396 0.8190734088420869 
1.3470779061317446 0.9683690369129182 0.097422134131193175 2.0070835351943974 -0.24653984606266019 1.0396918058395388 -0.24806005507707593 -0.89694124460220326 -0.89694124460220326 -0.15682714059948918 0.9683690369129182 1.0033385753631594 1.4775097370147707 -decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 -left_child=1 -1 5 -2 30 6 8 56 15 -10 11 28 33 14 16 31 -13 37 23 20 21 22 46 25 54 -9 -7 -28 -6 32 -5 -3 -17 58 36 59 -26 -15 39 -12 -41 -32 -30 51 -45 -38 -16 -48 -49 -50 -43 -44 -18 -54 55 -25 -8 -29 -4 -36 -37 -62 -right_child=3 2 12 4 10 26 7 18 9 -11 38 13 -14 17 19 29 52 -19 -20 -21 -22 -23 -24 24 34 -27 27 57 42 -31 41 -33 -34 -35 35 60 45 -39 -40 40 -42 50 43 44 -46 -47 47 48 49 -51 -52 -53 53 -55 -56 -57 -58 -59 -60 -61 61 -63 -leaf_value=0.14818426018202077 0.14908207968797435 -0.056001604604358247 -0.09107938986493283 0.099773897472549111 -0.098089572620234669 0.049416946037057152 0.099598206950204216 -0.094997870534035467 0.1500621713349628 -0.039270572773285245 0.11545948629672978 -0.017199462369006796 -0.13385245394349626 -0.058304209976345722 0.13481035250829956 -0.040563908224756813 0.096781825936605684 -0.099688570446472891 -0.09976041290405839 -0.15341911501765268 -0.10584294173002298 0.12975115926427955 -0.031083383330676936 -0.0018772875303351795 -0.075740544477610527 0.079220734270035328 -0.11813171243052989 0.1237966691682989 0.14963890067563337 0.1028628569268715 -0.06941919218064968 0.12770077567142113 -0.11473427927052469 -0.020797231947560935 -0.044521303805616745 -0.15579654536514065 -0.15267167231934853 0.11526472420697442 0.14409441501362041 -0.17535392693355104 0.068696526110865741 0.20527955400642184 -0.22962399675445738 -0.049163317165032638 0.13221259281661837 -0.074599337067346477 -0.043157305867122825 0.14382048678291418 -0.058873847260099234 0.074945413937081984 -0.13016024483402883 0.15825623525648758 0.15239140232740628 -0.14877074587109732 0.00031118510738506634 -0.047285446247914122 0.010017859362819473 0.029646443325005767 -0.11618262583669026 0.15325662608943794 -0.033186508330091152 -0.09390097122123571 -leaf_weight=1945.0419577807188 15987.374514937401 73.579776525497437 5030.5454336851835 96.240054205060005 49.06809438765049 1079.8054356276989 245.09292773902416 3060.914671421051 574.01532545685768 719.13880795240402 1578.0282243341208 446.4034079015255 9436.0856599509716 62.319677144289017 158.91259358823299 4136.4427901953459 1889.3673535734415 99.203468546271324 3111.456537887454 122.34958688914776 230.2920648008585 220.99852035939693 1060.1618119329214 974.74792142212391 4396.6282792389393 116.46620111167431 600.96416500210762 186.53141234815121 953.90299211442471 96.701954141259193 763.95576994121075 239.81515888869762 373.87395706772804 290.76625047624111 49.129657864570618 149.28567393124104 712.62586186826229 1562.3393216729164 1196.6424571871758 65.393022879958153 448.42781358957291 42.770065009593964 65.148609921336174 98.240400552749634 826.00405883789062 421.71172933280468 266.97476042807102 147.51611277461052 181.43369117379189 335.6534625440836 19.152030527591705 11.523141816258429 56.007576197385788 69.63471856713295 1405.8213812708855 3230.0410231947899 618.80135542154312 1017.9909987300634 3929.4092675000429 121.32038632035255 1156.9261836707592 739.99200142920017 -leaf_count=8781 72263 301 21307 405 200 4375 1028 12973 2582 2934 6884 1817 41742 263 644 16889 7982 419 13250 527 955 912 4310 3918 18275 477 2438 773 4174 411 3144 1049 
1549 1171 197 614 3122 6709 5205 267 1910 175 267 398 3617 1760 1077 638 730 1369 79 50 240 279 5628 13065 2485 4125 16887 486 4758 3046 -internal_value=0 -0.0652792 -0.073876 0.112928 0.0688867 -0.0467467 -0.0531273 -0.0635496 -0.0187233 0.0447719 0.0771366 0.0636398 -0.116863 0.0489293 0.0330757 -0.0354107 0.0704286 0.09662 -0.0679019 -0.000672901 0.00650959 0.0174192 0.00587615 -0.0619072 -0.0555552 -0.0886119 0.0123521 -0.00981516 0.114363 -0.0435725 -0.0402811 0.0845706 -0.0467121 -0.0995333 -0.0758934 -0.0517594 -0.0855656 0.108607 0.11372 0.0963439 0.0376367 -0.0566018 0.119696 0.0911587 0.112934 -0.123647 0.0418078 0.025943 0.053701 0.0279914 0.101531 -0.171329 0.0898417 -0.0145217 -0.0274705 -0.036759 0.0354324 0.0442265 -0.102088 0.0962501 -0.0640887 -0.0568714 -internal_weight=0 50241.7 48296.6 29111.4 13124.1 29609.8 26724.5 20511 6213.57 1293.15 12201.9 8913.46 18686.8 6909.57 5185.71 4920.41 2461.41 1723.86 19647.1 2724.29 2601.94 2371.65 2150.65 16535.6 13358.2 3177.38 2885.29 1805.49 2003.89 4607.02 922.118 313.395 4510.32 9250.72 7747.62 2216.65 5530.97 1624.66 3288.49 2091.85 513.821 825.878 1954.82 1000.92 924.244 1134.34 1090.49 931.578 664.603 517.087 61.9221 76.6718 2015.01 125.642 5610.61 4204.79 863.894 1204.52 8959.95 170.45 2046.2 1896.92 -internal_count=340305 212396 203615 127909 55646 122508 110797 85082 25715 5516 51843 37577 81107 28871 21480 20199 10318 7391 81569 11162 10635 9680 8768 68319 54869 13450 11711 7336 8706 18849 3803 1350 18438 39365 32258 9101 23157 6972 14266 9061 2177 3398 8506 4332 4015 4882 4458 3814 2737 2099 254 317 8501 519 22611 16983 3513 4898 38194 683 8418 7804 -is_linear=0 -shrinkage=0.1 - - -Tree=5 -num_leaves=63 -num_cat=0 -split_feature=9 3 0 0 1 7 8 0 7 3 2 2 1 0 3 8 7 7 8 8 2 8 3 0 2 7 0 3 7 2 7 2 3 1 8 8 1 7 7 3 1 8 8 8 3 8 2 1 8 8 8 7 2 3 7 3 3 0 1 7 1 7 -split_gain=49340.4 7770.16 4954.46 4267.7 995.459 833.196 791.084 660.113 645.522 520.802 541.309 709.166 958.601 488.39 849.57 398.047 336.371 295.314 288.724 285.749 268.823 258.621 243.726 568.909 290.417 234.636 232.164 318.559 206.074 205.365 172.517 172.315 162.369 160.972 149.109 152.126 144.886 143.378 143.064 142.723 142.21 139.183 137.165 218.672 142.001 153.59 135.728 159.64 171.803 271.502 147.61 135.319 133.949 139.873 135.457 133.618 126.022 185.389 126.887 125.678 121.782 121.437 -threshold=1.0000000180025095e-35 -1.0289423763751981 -0.33242557942867274 0.41995026171207434 0.56286767125129711 -0.29721513390541071 -0.7419527769088744 0.077961251139640822 -0.3375259786844253 6.3631205558776864 -0.33625254034996027 1.0000000180025095e-35 1.4546030759811404 -0.65161538124084462 -0.89694124460220326 -0.98042774200439442 -0.3375259786844253 -1.3274887800216673 -1.6652725934982298 -0.96772852540016163 -0.15682714059948918 -0.42785568535327906 -0.89694124460220326 -0.60601681470870961 -0.42596523463726038 -1.15643835067749 0.85313642024993908 0.8190734088420869 -0.27973499894142145 -0.24653984606266019 -0.27973499894142145 0.74029985070228588 2.0070835351943974 -0.574863702058792 -0.061670070514082902 -0.24806005507707593 -0.26736870408058161 2.8053728342056279 -0.38312204182147974 -0.76494011282920826 2.1618415117263798 -1.6652725934982298 0.50243258476257335 0.40141405165195471 2.0070835351943974 1.0000000180025095e-35 -0.24653984606266019 -0.82085970044136036 0.62498643994331371 0.38476034998893743 0.38478974997997289 -1.9059542417526243 1.0000000180025095e-35 0.29106890410184866 -1.3274887800216673 -0.76494011282920826 -0.89694124460220326 -0.7656116485595702 
-1.0976051688194273 -1.9507447481155393 -0.94385769963264454 2.2919597625732426 -decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 -left_child=1 -1 5 -2 21 6 13 28 51 10 -6 30 17 14 16 -16 -3 -13 -15 -17 22 32 24 46 -10 -19 -27 -28 41 61 38 -20 -5 -25 35 -35 -23 -29 -12 -30 -14 -4 43 44 -40 -46 47 -24 49 -49 -51 59 -38 -54 -55 -31 57 -22 -58 -8 -26 -7 -right_child=3 2 7 4 9 29 8 -9 20 -11 11 12 40 18 15 19 -18 25 31 -21 56 36 23 33 60 26 27 37 39 55 -32 -33 -34 34 -36 -37 52 -39 42 -41 -42 -43 -44 -45 45 -47 -48 48 -50 50 -52 -53 53 54 -56 -57 58 -59 -60 -61 -62 -63 -leaf_value=0.141288528566273 0.14223278736877351 0.13122023291189328 0.058825316065816605 0.070054306340172387 0.0098643736764147741 0.0036953615290966782 0.13692892632952905 -0.12570831384601053 -0.091497381182442952 -0.10640455204247433 0.13397245103304761 0.1355854073979153 0.079270481303283069 0.079469627584227598 0.010798043465105321 0.14135677835066418 -0.086109327590759482 -0.18707234258811611 -0.053964820667834285 0.024204814897973041 -0.10940132151924836 0.1203575946197642 0.0084546277637876331 -0.10457537243937848 -0.10234541810182805 -0.018379056442616597 0.12252969077301619 0.011873367437791063 0.19953700162977861 0.076977092675485928 0.14173345950350499 0.11264629389389756 -0.07631874507562314 -0.052720341525981852 -0.050772213851614872 -0.10962750798239235 0.089004833802223571 0.12968462084269997 0.088373235531483629 -0.04826284017053728 0.12609835014385948 -0.094154675397623788 0.12116305133762231 -0.10500124365112949 0.079298023070163787 -0.12660458012948186 -0.10223461322681221 -0.026768373761224429 -0.0062456181253537973 -0.16673899479479812 -0.069583623166518102 0.047928141856510523 0.11254189312161172 0.19021240072468765 -0.082970549434084828 -0.078043229470130843 0.12891026587119156 0.10622700166369224 -0.09899640233336697 -0.11793296390961217 0.050725243450492057 0.050083269727464179 -leaf_weight=1841.3072956502438 15125.797825530171 77.760086253285408 59.852951258420944 405.79977439343929 1109.8915030509233 1337.2712012082338 24.194596797227859 9372.2322725653648 2716.3843482136726 158.47523558139801 838.13487027585506 222.5365644544363 1150.1822365522385 181.11128161847591 1271.9915442317724 561.49904498457909 846.29411788284779 55.731186240911484 2979.8074869811535 330.89835520088673 82.26373203098774 49.137541681528091 1277.321421161294 1068.2027494907379 68.194144859910011 1026.3071466684341 380.06280212104321 1435.8116060197353 23.937039256095886 66.196135014295578 1510.4914483726025 63.395279437303543 93.187654703855515 827.6096099615097 3785.2005001306534 1086.3872408866882 59.253414615988731 111.31028950214386 1010.6965289711952 801.66798567771912 1486.862499922514 9357.9775532335043 657.85454444587231 71.608842089772224 77.220550328493118 68.24398210644722 221.25918582081795 1668.4666852504015 1547.00639526546 227.57032646238804 499.89698281884193 683.41747125983238 41.298066169023514 18.600388169288635 750.925312936306 347.40760561823845 24.682985007762909 77.375256225466728 2372.0805684775114 96.601555705070496 218.54260501265526 976.37723089754581 -leaf_count=8781 72263 318 256 1699 4669 5433 104 43349 11836 675 3891 1019 5018 813 5159 2653 3566 224 12297 1338 363 205 5127 4774 283 4311 1692 6020 98 268 6798 284 388 3460 15761 4675 263 507 4365 3376 6697 40854 2885 309 340 279 903 6744 6218 949 2053 2800 178 77 3137 1431 108 312 10379 388 892 3993 -internal_value=0 -0.0602447 -0.0681019 0.106279 0.0637578 
-0.0408031 -0.0465536 -0.10653 -0.0569483 0.0736103 0.0761545 0.0834369 0.0609328 -0.0159716 0.0124483 0.0467181 -0.0678208 0.0244252 -0.0431939 0.0979172 -0.0609146 -0.0152411 -0.0558722 -0.0495945 -0.0813939 0.0162048 0.0200405 0.0405006 -0.0889829 0.0116691 0.114628 -0.050494 0.0427186 -0.0689512 -0.0622742 -0.0850209 -0.046704 0.0203495 0.0995968 -0.0410783 0.105674 -0.0931824 0.0843172 0.0645748 0.0750777 -0.0173002 -0.0255213 -0.0222698 -0.0322231 -0.0489962 -0.0999763 0.0306826 -0.0561388 -0.0667456 -0.0763674 -0.0532327 -0.0909192 -0.00488865 -0.0966493 -0.066886 0.0143207 0.0232714 -internal_weight=0 49069.6 47228.3 27915.4 12789.6 27612.7 24885.4 19615.7 18572.7 11371.4 11212.9 10103.1 5868.8 6312.76 3088.44 2164.39 924.054 3231.76 3224.31 892.397 17768.4 1418.2 15212 12208.9 3003.12 3009.22 2953.49 1927.18 10243.4 2727.25 4234.25 3043.2 498.987 6767.4 5699.2 1914 919.215 1547.12 2723.76 825.605 2637.04 9417.83 1885.62 1227.77 1156.16 145.465 5441.52 5220.26 3942.94 2395.93 727.467 804.214 870.077 810.824 769.526 413.604 2556.4 159.639 2396.76 120.796 286.737 2313.65 -internal_count=340305 212396 203615 127909 55646 115682 104557 87933 78129 49699 49024 44355 25488 26428 13034 9150 3884 13773 13394 3991 74837 5947 63675 50664 13011 12754 12530 8219 44584 11125 18867 12581 2087 28670 23896 8135 3860 6527 12069 3474 11715 41110 8178 5293 4984 619 21994 21091 15964 9746 3002 3292 3655 3392 3214 1699 11162 675 10487 492 1175 9426 -is_linear=0 -shrinkage=0.1 - - -Tree=6 -num_leaves=63 -num_cat=0 -split_feature=9 3 0 0 0 1 3 2 1 7 7 7 1 2 2 0 0 3 7 3 7 8 3 3 2 7 7 8 3 2 2 8 2 1 8 1 2 8 8 8 3 0 2 2 8 8 8 3 8 8 1 3 0 8 8 2 8 8 8 8 7 7 -split_gain=41716.1 6577.96 4398.11 4001.71 890.934 844.029 668.337 791.922 581.934 551.596 497.482 418.627 412.722 560.25 772.843 397.038 373.116 480.095 362.062 484.34 349.051 327.441 284.193 257.014 249.907 243.297 242.675 230.408 225.451 240.308 250.956 260.389 223.72 216.74 206.697 201.994 191.423 177.849 174.906 168.704 159.64 158 151.505 150.591 147.07 207.885 158.71 142.404 179.295 172.182 159.078 141.844 141.502 148.819 138.951 132.565 130.803 152.883 372.161 152.448 127.78 127.75 -threshold=1.0000000180025095e-35 -1.0289423763751981 -0.24122850596904752 0.41995026171207434 -0.60601681470870961 0.47061920166015631 -0.76494011282920826 -0.067114436998963342 -0.79011020064353932 -0.3375259786844253 2.8198474645614628 -1.3274887800216673 3.0228273868560795 1.0000000180025095e-35 -0.33625254034996027 0.14635906368494037 0.96713274717330944 4.1191017627716073 -0.29721513390541071 -0.89694124460220326 -0.27973499894142145 -1.6652725934982298 0.8190734088420869 -0.89694124460220326 -0.24653984606266019 -1.921579837799072 -1.0192310214042661 -0.96772852540016163 -0.89694124460220326 -0.42596523463726038 0.4711617529392243 -0.73045688867568959 -0.42596523463726038 1.7313485741615298 -0.090793885290622697 -1.0053566694259641 -0.42596523463726038 0.67133131623268139 -1.0039461255073545 -1.1708780527114866 0.55507114529609691 -0.99360433220863331 -0.33625254034996027 0.56087446212768566 1.5570636391639712 1.0512721538543703 -0.41330277919769282 1.8750824332237246 -1.2958669066429136 -0.15848610550165174 1.208607077598572 -0.76494011282920826 -0.81121021509170521 0.85398980975151073 -1.6652725934982298 -0.15682714059948918 1.1510444879531863 1.4775097370147707 1.4728554487228396 1.1410096287727358 -0.27973499894142145 2.7036424875259404 -decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 -left_child=1 -1 4 -2 6 40 9 8 27 -3 11 25 13 14 -13 55 44 22 19 32 21 -6 -18 41 54 -7 -23 38 36 56 51 -32 -11 43 -21 -34 -28 -37 -8 -19 -5 -20 -29 -24 45 46 -15 48 -45 -49 -50 -31 -36 -54 -10 -4 59 58 -58 -30 -16 -42 -right_child=3 2 15 5 20 10 7 -9 24 18 -12 12 -14 16 60 -17 17 39 23 34 -22 26 33 -25 -26 -27 28 42 29 30 31 -33 35 -35 52 37 -38 -39 -40 -41 61 -43 -44 47 -46 -47 -48 49 50 -51 -52 -53 53 -55 -56 -57 57 -59 -60 -61 -62 -63 -leaf_value=0.13557368128277333 0.13657910854670494 0.06601611682254202 -0.077858992106740965 0.060788686877082804 0.073406411296858473 -0.09360995850006909 0.037962052563105947 -0.11239125127674927 0.12685614941583306 -0.086541730339634304 0.13030890751029348 -0.0038450920306734946 0.13068979581492093 0.02245376605105559 0.085617405254202791 -0.12120024286786629 0.13291059907074582 0.15915620832294972 -0.029319630191528629 -0.00063921056133360251 -0.0072259491476477289 0.032213270837148582 0.047535687414767665 -0.04006871628535344 -0.110920024122933 0.1362651605022249 -0.033276643880435705 0.054542547677769941 -0.058970527866441504 -0.0052413850667646297 0.11369509025178909 -0.067125230641197201 -0.1180642431272446 0.081386480507888784 -0.010178419675391806 0.08002153359255855 0.16285161637222706 -0.084969148029288247 0.13749644781581871 -0.13423268279934558 -0.06505325017563332 0.10208029559061914 -0.034532154250236188 -0.15183129736588907 -0.10861499876942218 0.10475511919909622 -0.046086787480452168 0.13305312001740704 -0.068670478342656258 -0.026490373517967181 0.088415344512374872 -0.091352437738352901 -0.081309661740005701 -0.022177152752834023 -0.0044390325917311978 -0.10389774888472283 -0.025896445444226542 -0.063056230680629835 0.17961060297845907 -0.16274536370092202 0.12991079313857742 0.089059323091842765 -leaf_weight=1732.6260898858309 14223.171266973019 473.73424929380417 4341.0237987935543 147.26246252655983 176.494963362813 48.620824873447418 267.7653302103281 520.70945706963539 82.956139177083969 3317.7240589857101 1096.5955583453178 965.48766043782234 643.23518075048923 590.94654749333858 2283.0116169303656 7992.9454456567764 407.97738921642303 21.949863582849503 124.93978016078472 1644.0930242240429 1271.156805023551 259.07460194826126 868.61804920434952 432.48366369307041 221.33878023922443 868.04877299070358 142.65745550394058 719.22036315500736 6410.4827152788639 198.71859905123711 127.16511069238186 213.09297731518745 125.58823479712009 914.48008418083191 787.88268768787384 223.78045365214348 76.422883480787277 92.272231891751289 518.23080553114414 183.00618994235992 692.49121737480164 342.002980068326 259.96960653364658 126.93005357682705 148.27521876990795 156.87052491307259 788.77705426514149 115.28462113440037 166.57044097781181 163.69037157297134 105.16993886232376 5117.0867483913898 1220.3531514853239 653.52112759649754 2844.4581853896379 3557.4562506228685 528.67229659855366 568.29592843353748 105.74688777327538 144.75600738823414 911.28077167272568 58.317444115877151 -leaf_count=8781 72263 1992 19404 636 796 200 1091 2309 379 14729 5205 4055 3015 2479 10351 38906 1899 90 504 6643 5214 1055 3676 1750 942 4174 633 2896 27397 860 568 965 554 4216 3181 927 318 382 2606 801 2923 1456 1088 516 638 663 3280 489 693 695 436 22759 5100 2647 11525 16356 2219 2418 443 603 4272 244 -internal_value=0 -0.0557119 -0.0629005 0.100323 -0.0405005 0.0589786 -0.0230609 0.0048749 0.0173011 -0.0391464 0.0662245 0.0595178 0.0533289 0.0477493 0.0745564 -0.105487 0.0243152 0.0450471 -0.0447037 -0.0514149 -0.0574095 -0.0619437 0.0556109 
0.015476 -0.00846495 0.124072 -0.0636633 0.0632628 -0.0654853 -0.0671287 -0.0828042 0.00045299 -0.0776413 0.042795 -0.028517 -0.0107792 0.0351398 0.0318522 0.103588 -0.102812 -0.0344106 0.0669216 0.0308937 0.0199716 -0.0135056 -0.00432791 -0.0167303 -0.0153606 -0.053708 0.0394399 -0.00787456 -0.0881334 -0.0457364 -0.060687 -0.000718431 -0.0895868 -0.0557003 -0.025386 0.00835808 -0.0612621 0.0982536 -0.0530829 -internal_weight=0 47836.9 46104.3 26696.1 30212.8 12472.9 14873 5434.65 4913.94 9438.38 11574.8 10478.2 9561.56 8918.33 4159.78 15891.4 4758.55 3073.68 8964.64 8065.21 15339.8 14068.7 2868.72 899.426 3148.75 916.67 13892.2 1765.19 13633.1 13414 5656.06 340.258 3759.36 2460.74 4305.85 441.641 219.08 316.053 785.996 204.956 898.071 466.943 979.19 1546.26 1684.87 1536.59 1379.72 677.645 398.67 278.975 271.74 5315.81 2661.76 1873.87 2927.41 7898.48 7757.95 1202.72 634.419 6555.24 3194.29 750.809 -internal_count=340305 212396 203615 127909 128949 55646 62701 22836 20527 39865 51843 46638 42264 39249 18678 74666 20571 13511 37873 34163 66248 61034 12620 3710 12846 4374 60238 7681 59183 58232 25152 1533 16592 10721 17571 1863 951 1309 3697 891 3803 1960 3984 6505 7060 6422 5759 2829 1645 1184 1129 23619 10928 7747 11904 35760 33080 5080 2662 28000 14623 3167 -is_linear=0 -shrinkage=0.1 - - -Tree=7 -num_leaves=63 -num_cat=0 -split_feature=9 3 0 0 0 1 3 2 1 7 7 7 0 7 7 1 2 2 7 7 3 8 7 3 3 2 2 3 2 8 8 8 8 8 2 1 8 8 1 8 7 8 8 7 8 8 8 8 3 8 8 2 3 0 0 0 3 8 8 8 0 8 -split_gain=35383 5589.33 3905.89 3776.8 742.772 709.609 551.809 664.829 483.811 456.555 454.125 478.5 393.511 381.876 361.721 377.52 386.911 339.657 307.33 297.066 403.106 294.933 261.352 236.73 212.267 276.794 259.118 211.721 209.328 206.753 204.233 254.148 230.132 207.018 187.231 175.852 172.395 274.39 167.924 166.972 164.23 166.572 196.88 147.116 146.01 178.091 147.722 143.759 140.114 139.294 133.622 132.935 132.784 132.558 130.726 128.705 166.67 135.089 537.595 149.639 180.991 125.42 -threshold=1.0000000180025095e-35 -1.0289423763751981 -0.24122850596904752 0.41995026171207434 -0.60601681470870961 0.47061920166015631 -0.76494011282920826 -0.067114436998963342 -0.79011020064353932 -0.53931042551994313 2.4357533454895024 2.2919597625732426 0.19195760041475299 -0.38312204182147974 0.1215791590511799 2.0388435125350957 1.0000000180025095e-35 -0.42596523463726038 2.4357533454895024 -0.29721513390541071 -0.89694124460220326 -1.6652725934982298 0.084623351693153395 7.9471342563629159 -0.76494011282920826 -0.42596523463726038 -0.24653984606266019 -0.89694124460220326 -0.24653984606266019 -0.96772852540016163 1.5684867501258852 1.1580279469490053 0.70141646265983593 0.71233883500099193 -0.42596523463726038 -0.9131081998348235 0.38476034998893743 0.38478974997997289 -1.0053566694259641 -1.0039461255073545 -1.921579837799072 1.5570636391639712 1.6428202986717226 2.4995682239532475 0.46185493469238287 0.40141405165195471 0.42557564377784735 0.71233883500099193 0.95107451081275951 0.40141405165195471 0.64074057340621959 -0.15682714059948918 0.42307002842426306 -0.99360433220863331 -0.58321756124496449 0.85313642024993908 0.8190734088420869 -1.3048880100250242 -1.275070786476135 -0.77818816900253285 1.2635232210159304 -0.073026757687330232 -decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 -left_child=1 -1 4 -2 6 52 9 8 29 -3 11 13 43 40 15 16 17 48 21 20 34 -6 30 -17 26 -26 -23 53 -10 39 31 32 55 -34 -11 -27 -22 -38 -36 -8 -7 61 -43 51 45 -19 -47 -40 -15 
-28 -51 -4 -5 -21 -37 -18 -57 -58 -59 -60 -61 -42 -right_child=3 2 12 5 18 10 7 -9 28 19 -12 -13 -14 14 -16 23 22 44 -20 27 36 24 -24 -25 25 35 49 -29 -30 -31 -32 -33 33 -35 38 54 37 -39 47 -41 41 42 -44 -45 -46 46 -48 -49 -50 50 -52 -53 -54 -55 -56 56 57 58 59 60 -62 -63 -leaf_value=0.1307711672590863 0.13185128733333404 0.064812816116550695 -0.075030234777900176 0.083505318196732473 0.070961402348268712 -0.085227680884450763 0.034176355467558137 -0.10509043136854944 -0.00064006627074177655 -0.079427927982523877 0.10794876089124028 -0.14513434935036137 -0.11669219723845924 0.019218620504788998 0.10008416063941106 0.10447469903062337 -0.014809966122621698 0.062417057132302639 0.022112182338378462 -0.026391168381679826 -0.0066808779871671345 -0.039396885207924379 -0.14401329960244758 -0.11323848990849433 -0.04818144028634868 0.079253697193630124 -0.021043480228937386 -0.036126026254475328 -0.10283451300529699 0.027864334564553114 -0.081207687113249577 0.11507201577064737 -0.19120540774871642 -0.023695645429891762 -0.10777397203028426 -0.11711577838156047 -0.15420053827901192 -0.03444071948678061 0.07204300856027028 0.13254677888210395 0.081249165234548892 -0.20693082652872438 0.11495837739672232 -0.011349355921060282 0.10848804099276975 -0.26739184679519551 0.011795288521794432 -0.076651891248952941 -0.094207149053916861 0.19158038279455411 -0.032450698669920945 -0.10105241696778747 -0.044368128895742334 0.094388840747716349 -0.073339090365344847 0.12479885605469457 0.10317530256646899 -0.1822031269542097 0.13466124990540587 -0.034069829323339622 0.072260506889391826 0.13631167973694436 -leaf_weight=1622.3096879571676 13306.989917144179 430.1712691038847 4302.1470460742712 90.410361588001251 182.39582592248917 48.089887693524361 267.68573613464832 502.16803561151028 2923.0686435103416 3236.637579575181 1773.2976521104574 120.56886732578278 7041.4345580339432 309.34996344149113 1048.5366927236319 954.9667137414217 646.53552749752998 1234.3990990817547 530.1876575499773 124.92362134158611 2226.6431866139174 1106.6567640006542 117.55829508602619 52.700248539447784 7386.7053927779198 70.560405597090721 90.611370608210564 431.75275781750679 215.1898141503334 977.05306252837181 234.97985816001892 236.14226090908051 91.910316824913025 373.97173842787743 125.65035183727741 796.26615828275681 213.20106665790081 1863.5751914680004 224.15155853331089 485.50898364186287 827.61186444759369 37.219403833150864 38.820454642176628 266.38508561253548 739.05940829217434 29.813096702098846 52.021848157048225 91.58573979139328 168.08067572116852 109.44436626136303 35.181396663188934 3610.8099528849125 797.62097728252411 333.34049712121487 4759.4562739878893 195.94969698786736 182.12400595843792 126.73849537968636 137.23880179226398 317.18256616592407 323.20287238061428 827.07122235000134 -leaf_count=8781 72263 1835 19918 405 832 200 1091 2309 11904 14785 8693 551 36181 1251 5022 4653 2644 5575 2165 504 9030 4735 531 247 31861 336 391 1750 942 3984 1020 1034 377 1579 575 3857 942 7677 930 2606 4015 160 187 1126 3443 128 230 381 695 471 157 17441 3398 1456 21443 946 766 521 581 1284 1366 4144 -internal_value=0 -0.051583 -0.0581628 0.0950596 -0.0370769 0.0547085 -0.0210617 0.00445999 0.0157597 -0.0357978 0.0615046 0.0528094 -0.0993626 0.0553615 0.0455673 0.0368052 0.0265238 0.0552683 -0.0526719 -0.0406763 -0.0467869 -0.0553994 0.0021226 0.0930885 -0.0570049 -0.0609099 -0.019137 0.0141225 -0.00764752 0.0582148 0.00811689 0.0160947 0.00633509 -0.0567424 -0.071096 -0.0776209 -0.0260106 -0.0467352 -0.00999916 
0.0975859 0.097056 0.102121 -0.0425975 -0.0844439 0.0729183 0.052946 -0.0899147 0.0289112 -0.0207132 0.076174 0.137083 -0.0869046 -0.0313493 0.061464 -0.0796133 0.0215695 0.03991 0.0246002 0.0087764 0.0399026 0.0195951 0.108771 -internal_weight=0 46582.9 44960.5 25440.2 29739.8 12133.2 14672.3 5370.67 4868.51 9301.63 11245.1 9471.84 15220.8 9351.28 7572.46 6523.93 5516.26 2532.72 15067.5 8871.46 7981.44 14537.3 2983.53 1007.67 14354.9 13013 1341.89 890.017 3138.26 1730.25 2865.98 2631 2394.85 465.882 3678.03 5626.28 4303.42 2076.78 441.388 753.195 1778.81 1730.72 76.0399 8179.34 2055.29 1316.23 81.8349 315.737 477.431 235.237 144.626 7912.96 888.031 458.264 5555.72 1928.97 1282.44 1086.49 904.363 777.624 640.385 1654.68 -internal_count=340305 212396 203615 127909 128949 55646 62701 22836 20527 39865 51843 43150 74666 42599 33893 28871 23971 11322 66248 38030 34320 64083 12649 4900 63251 57497 5754 3710 12846 7681 12118 11098 10064 1956 16671 25636 17649 8619 1886 3697 8706 8506 347 38485 9376 5933 358 1311 1946 1019 628 37359 3803 1960 25300 8108 5464 4518 3752 3231 2650 8159 -is_linear=0 -shrinkage=0.1 - - -Tree=8 -num_leaves=63 -num_cat=0 -split_feature=9 3 0 0 0 1 3 2 1 7 0 7 7 1 3 2 2 1 7 7 3 8 7 8 8 7 2 3 3 2 3 2 2 3 8 8 8 8 1 7 7 7 7 2 0 1 2 8 7 7 7 3 8 8 2 7 7 8 7 8 8 8 -split_gain=30080.9 4763.53 3582.31 3482.91 618.522 606.847 455.815 559.385 402.036 393.657 388.515 377.733 336.663 328.815 366.228 296.962 527.658 361.76 266.774 247.818 364.834 223.21 207.501 189.471 186.499 183.891 175.93 174.696 171.887 161.788 160.774 190.269 198.961 213.123 160.073 153.475 147.911 242.335 143.459 142.517 341.55 136.536 161.021 136.288 129.635 128.898 125.191 119.846 119.372 228.174 119.361 118.539 117.874 117.839 116.406 141.851 113.607 112.629 115.928 111.33 134.243 128.025 -threshold=1.0000000180025095e-35 -1.0289423763751981 0.41995026171207434 -0.24122850596904752 -0.60601681470870961 0.56286767125129711 -0.76494011282920826 -0.067114436998963342 -0.79011020064353932 2.8198474645614628 0.12355979532003404 -0.3375259786844253 -1.3274887800216673 3.0228273868560795 3.7230983972549443 -0.42596523463726038 1.0000000180025095e-35 1.3316050767898562 -0.27973499894142145 -0.29721513390541071 -0.89694124460220326 -1.6652725934982298 -1.0192310214042661 -0.77818816900253285 -0.96772852540016163 -1.921579837799072 -0.24653984606266019 -0.89694124460220326 0.55507114529609691 -0.42596523463726038 -0.89694124460220326 -0.42596523463726038 0.4711617529392243 -0.50093787908554066 -1.0039461255073545 -1.1708780527114866 0.38476034998893743 0.38478974997997289 -1.0053566694259641 1.9970077276229861 2.3508639335632329 -1.1447212696075437 -0.36696811020374293 -0.42596523463726038 0.71634078025817882 -0.29811820387840265 -0.24653984606266019 -1.6671294569969175 1.5623720884323122 2.2919597625732426 2.9366999864578252 2.0070835351943974 -1.6652725934982298 0.67133131623268139 -0.33625254034996027 2.8053728342056279 -0.27973499894142145 0.58425346016883861 -0.3375259786844253 1.1510444879531863 1.1410096287727358 1.4775097370147707 -decision_type=2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 -left_child=1 -1 -2 4 6 23 11 8 24 12 50 -3 25 14 15 -14 54 28 21 20 29 47 -23 51 34 -7 52 48 44 -13 43 59 -33 -34 -8 -16 -22 -38 -31 -20 -41 -30 -43 -24 -18 -25 -26 -6 -21 -50 -5 -4 -10 -40 55 -17 57 58 -56 60 -32 -61 -right_child=2 3 5 10 18 9 7 -9 26 -11 -12 19 13 -15 35 16 17 -19 39 27 36 22 30 45 46 -27 -28 -29 41 38 31 32 33 -35 -36 -37 37 
-[... remainder of the deleted LightGBM model text file omitted: per-tree arrays (split_feature, split_gain, threshold, decision_type, left_child, right_child, leaf_value, leaf_weight, leaf_count, internal_value, internal_weight, internal_count), the feature_importances block, and the full training-parameter list]
-end of 
parameters - -pandas_categorical:null diff --git a/embedding_gene/keywords_gene.py b/embedding_gene/keywords_gene.py index 41d5976..272c4cc 100644 --- a/embedding_gene/keywords_gene.py +++ b/embedding_gene/keywords_gene.py @@ -45,7 +45,7 @@ def add_unknown_words(word_vecs, vocab,k): # print(vectors) # 将结果存储到csv文件中 -with open('tweet_keywords_embedding_new.csv', 'w', newline='', encoding='utf-8') as f: +with open('tweet_keywords_embedding.csv', 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f) # 写入表头 writer.writerow(['word', 'vector']) diff --git a/lbf/__init__.py b/lbf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lbf/__pycache__/bf.cpython-311.pyc b/lbf/__pycache__/bf.cpython-311.pyc new file mode 100644 index 0000000..fe3858e Binary files /dev/null and b/lbf/__pycache__/bf.cpython-311.pyc differ diff --git a/lbf/__pycache__/learn_bf.cpython-311.pyc b/lbf/__pycache__/learn_bf.cpython-311.pyc new file mode 100644 index 0000000..093b815 Binary files /dev/null and b/lbf/__pycache__/learn_bf.cpython-311.pyc differ diff --git a/lbf/bf.py b/lbf/bf.py new file mode 100644 index 0000000..516ca2f --- /dev/null +++ b/lbf/bf.py @@ -0,0 +1,103 @@ +import numpy as np +import pandas as pd +from sklearn.utils import murmurhash3_32 +from random import randint +import argparse + + +def hashfunc(m): + ss = randint(1, 99999999) + + def hash_m(x): + return murmurhash3_32(x, seed=ss) % m + + return hash_m + + +''' +Class for Standard Bloom filter +''' + + +class BloomFilter: + def __init__(self, n, hash_len): + self.n = n + self.hash_len = int(hash_len) + if (self.n > 0) & (self.hash_len > 0): + self.k = max(1, int(self.hash_len / n * 0.6931472)) + elif self.n == 0: + self.k = 1 + self.h = [] + for i in range(self.k): + self.h.append(hashfunc(self.hash_len)) + self.table = np.zeros(self.hash_len, dtype=int) + + def insert(self, key): + if self.hash_len == 0: + raise SyntaxError('cannot insert to an empty hash table') + for i in key: + for j in range(self.k): + t = self.h[j](i) + self.table[t] = 1 + + # def test(self, key): + # test_result = 0 + # match = 0 + # if self.hash_len > 0: + # for j in range(self.k): + # t = self.h[j](key) + # match += 1*(self.table[t] == 1) + # if match == self.k: + # test_result = 1 + # return test_result + + def test(self, keys, single_key=True): + if single_key: + test_result = 0 + match = 0 + if self.hash_len > 0: + for j in range(self.k): + t = self.h[j](keys) + match += 1 * (self.table[t] == 1) + if match == self.k: + test_result = 1 + else: + test_result = np.zeros(len(keys)) + ss = 0 + if self.hash_len > 0: + for key in keys: + match = 0 + for j in range(self.k): + t = self.h[j](key) + match += 1 * (self.table[t] == 1) + if match == self.k: + test_result[ss] = 1 + ss += 1 + return test_result + + +'''Run Bloom filter''' + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, + help="path of the dataset") + parser.add_argument('--size_of_Ada_BF', action="store", dest="R_sum", type=int, required=True, + help="size of the Ada-BF") + + results = parser.parse_args() + DATA_PATH = results.data_path + R_sum = results.R_sum + + data = pd.read_csv(DATA_PATH) + + negative_sample = data.loc[(data['label'] == -1)] + positive_sample = data.loc[(data['label'] == 1)] + + url = positive_sample['url'] + n = len(url) + bloom_filter = BloomFilter(n, R_sum) + bloom_filter.insert(url) + url_negative = negative_sample['url'] + n1 = 
bloom_filter.test(url_negative, single_key=False) + print('False positive items: ', sum(n1)) diff --git a/lbf/learn_bf.py b/lbf/learn_bf.py new file mode 100644 index 0000000..9b49dbe --- /dev/null +++ b/lbf/learn_bf.py @@ -0,0 +1,81 @@ +import numpy as np +import pandas as pd + +from bf import BloomFilter + + +def Find_Optimal_Parameters(max_thres, min_thres, R_sum, train_negative, positive_sample): + FP_opt = train_negative.shape[0] + + for threshold in np.arange(min_thres, max_thres + 10 ** (-6), 0.01): + url = positive_sample.loc[(positive_sample['score'] <= threshold), 'url'] + n = len(url) + bloom_filter = BloomFilter(n, R_sum) + bloom_filter.insert(url) + ML_positive = train_negative.loc[(train_negative['score'] > threshold), 'url'] + bloom_negative = train_negative.loc[(train_negative['score'] <= threshold), 'url'] + BF_positive = bloom_filter.test(bloom_negative, single_key=False) + FP_items = sum(BF_positive) + len(ML_positive) + print('Threshold: %f, False positive items: %d' % (round(threshold, 2), FP_items)) + if FP_opt > FP_items: + FP_opt = FP_items + thres_opt = threshold + bloom_filter_opt = bloom_filter + + return bloom_filter_opt, thres_opt + + +def run(R_sum, path, model, X_query, y_query, query_urls): + data = pd.read_csv(path) + negative_sample = data.loc[(data['label'] == 0)] + positive_sample = data.loc[(data['label'] == 1)] + # train_negative = negative_sample.sample(frac=0.8) + train_negative = negative_sample + + bloom_filter_opt, thresholds_opt = Find_Optimal_Parameters(0.99, 0.01, R_sum, train_negative, positive_sample) + fn = 0 + fp = 0 + cnt_ml = 0 + cnt_bf = 0 + total = len(X_query) + print(f"query count = {total}") + prediction_results = model.predict(X_query) + + for i in range(total): + true_label = y_query[i] + url = query_urls[i] + score = prediction_results[i] + if score > thresholds_opt: + if true_label == 0: + fp += 1 + cnt_ml += 1 + else: + if bloom_filter_opt.test(url) == 1 and true_label == 0: + fp += 1 + cnt_bf += 1 + elif bloom_filter_opt.test(url) == 0 and true_label == 1: + fn = fn + 1 + + print(f"fp: {fp}") + print(f"total: {total}") + print(f"fpr: {float(fp) / total}") + print(f"fnr: {float(fn) / total}") + print(f"cnt_ml: {cnt_ml}") + print(f"cnt_bf: {cnt_bf}") + return float(fp) / total + +# ''' +# Implement learned Bloom filter +# ''' +# if __name__ == '__main__': +# '''Stage 1: Find the hyper-parameters (spare 30% samples to find the parameters)''' +# bloom_filter_opt, thres_opt = Find_Optimal_Parameters(max_thres, min_thres, R_sum, train_negative, positive_sample) +# +# '''Stage 2: Run Ada-BF on all the samples''' +# ### Test URLs +# ML_positive = negative_sample.loc[(negative_sample['score'] > thres_opt), 'url'] +# bloom_negative = negative_sample.loc[(negative_sample['score'] <= thres_opt), 'url'] +# score_negative = negative_sample.loc[(negative_sample['score'] < thres_opt), 'score'] +# BF_positive = bloom_filter_opt.test(bloom_negative, single_key = False) +# FP_items = sum(BF_positive) + len(ML_positive) +# print('False positive items: %d' % FP_items) diff --git a/lbf/main.py b/lbf/main.py new file mode 100644 index 0000000..7653198 --- /dev/null +++ b/lbf/main.py @@ -0,0 +1,62 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd + +import lib.lgb_url +import learn_bf + +df_train = pd.read_csv('../dataset/url_train.csv') +df_test = pd.read_csv('../dataset/url_test.csv') +df_query = pd.read_csv('../dataset/url_query.csv') + +train_urls = df_train['url'] +test_urls = df_test['url'] +query_urls = df_query['url'] + 
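# The k chosen in lbf/bf.py (hash_len / n * 0.6931472) is the standard FPR-optimal number of hash
# functions, k = (m/n) * ln 2 for m bits and n keys. A minimal illustrative sketch of the resulting
# false-positive rate, mirroring that rule; this helper is an editor's example, not part of the patch:
import math

def bloom_fpr(n_items, m_bits):
    # Approximate FPR of a standard Bloom filter holding n_items keys in an m_bits-bit table.
    if n_items <= 0 or m_bits <= 0:
        return 0.0
    k = max(1, int(m_bits / n_items * 0.6931472))        # same rule as BloomFilter.__init__
    return (1.0 - math.exp(-k * n_items / m_bits)) ** k  # (1 - e^(-k*n/m))^k

# e.g. 100,000 keys in 64 KiB of bit array: bloom_fpr(100_000, 64 * 1024 * 8) is roughly 0.08.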
+X_train = df_train.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_train = df_train['url_type'].values.astype(np.float32) +X_test = df_test.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_test = df_test['url_type'].values.astype(np.float32) +X_query = df_query.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_query = df_query['url_type'].values.astype(np.float32) + +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) +n_true = df_train[df_train['url_type'] == 1].shape[0] + df_test[df_test['url_type'] == 1].shape[0] +n_test = len(df_test) + +bst = lgb.Booster(model_file='../best_bst_20480') + +y_pred_train = bst.predict(X_train) +y_pred_test = bst.predict(X_test) + +train_results = pd.DataFrame({ + 'url': train_urls, + 'label': y_train, + 'score': y_pred_train +}) + +test_results = pd.DataFrame({ + 'url': test_urls, + 'label': y_test, + 'score': y_pred_test +}) +all_results = pd.concat([train_results, test_results]) +all_results.to_csv('url_results.csv', index=False) + +# 初始化变量 +model_size = lib.lgb_url.lgb_get_model_size(bst) +print("模型在内存中所占用的大小(字节):", model_size) + +for size in range(64 * 1024, 320 * 1024 + 1, 64 * 1024): + bloom_size = size - model_size + learn_bf.run( + R_sum=bloom_size*8, + path='url_results.csv', + model=bst, + X_query=X_query, + y_query=y_query, + query_urls=query_urls + ) + size *= 2 + diff --git a/lbf/yelp_main.py b/lbf/yelp_main.py new file mode 100644 index 0000000..482c5b8 --- /dev/null +++ b/lbf/yelp_main.py @@ -0,0 +1,116 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd + +import lib.lgb_url +import learn_bf +import lib.network +import lib.data_processing + +data_train = pd.read_csv('../dataset/tweet/tweet_train.csv') +data_test = pd.read_csv('../dataset/tweet/tweet_test.csv') +data_query = pd.read_csv('../dataset/tweet/tweet_query.csv') + +word_dict, region_dict = lib.data_processing.loading_embedding("tweet") + + +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + true_num = data_train[data_train['is_in'] == 1].shape[0] + false_num = data_train[data_train['is_in'] == 0].shape[0] + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) + + # 生成一个用于神经网络输入的dataframe:embedding + embedding = pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1) + # print(embedding) + y = data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + # print(X) + return X, y, insert, true_num, false_num + + +X_train, y_train, train_insert, train_true, train_false = yelp_embedding(data_train, word_dict=word_dict, + region_dict=region_dict) +X_test, y_test, test_insert, test_true, test_false = 
yelp_embedding(data_test, word_dict=word_dict, + region_dict=region_dict) +X_query, y_query, query_insert, query_true, query_false = yelp_embedding(data_query, word_dict=word_dict, + region_dict=region_dict) +print(query_insert) + +n_true = train_true + test_true +n_false = test_false + train_false + +n_test = test_true + test_false + +# 清理内存 + + +# 3. 划分训练集和测试集 +X_train = X_train.values.astype(np.float32) +X_test = X_test.values.astype(np.float32) +y_train = y_train.values.astype(np.float32) +y_test = y_test.values.astype(np.float32) +# 4. 创建 LightGBM 数据集 +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, free_raw_data=False) +query_data = lgb.Dataset(X_query, label=y_query, free_raw_data=False) + +bst = lgb.Booster(model_file='../best_bst_20480') + +y_pred_train = bst.predict(X_train) +y_pred_test = bst.predict(X_test) +y_pred_query = bst.predict(X_query) + +train_results = pd.DataFrame({ + 'url': train_insert, + 'label': y_train, + 'score': y_pred_train +}) + +test_results = pd.DataFrame({ + 'url': test_insert, + 'label': y_test, + 'score': y_pred_test +}) +all_results = pd.concat([train_results, test_results]) +all_results.to_csv('url_results.csv', index=False) +query_results = pd.DataFrame({ + 'url': query_insert, + 'label': y_query, + 'score': y_pred_query +}) +all_results = pd.concat([train_results, test_results]) +all_results.to_csv('url_results.csv', index=False) + +# 初始化变量 +model_size = lib.lgb_url.lgb_get_model_size(bst) +print("模型在内存中所占用的大小(字节):", model_size) + +for size in range(64 * 1024, 320 * 1024 + 1, 64 * 1024): + bloom_size = size - model_size + learn_bf.run( + R_sum=bloom_size * 8, + path='url_results.csv', + model=bst, + X_query=X_query, + y_query=y_query, + query_urls=query_insert + ) diff --git a/lgb_tweet_autoLBF_main.py b/lgb_tweet_autoLBF_main.py new file mode 100644 index 0000000..f0a5a30 --- /dev/null +++ b/lgb_tweet_autoLBF_main.py @@ -0,0 +1,258 @@ +import time + +import lightgbm as lgb +import numpy as np +import pandas as pd +import lib.network +import lib.data_processing +import lib.bf_util +import lib.lgb_url + +data_train = pd.read_csv('dataset/tweet/tweet_train.csv') +data_test = pd.read_csv('dataset/tweet/tweet_test.csv') +data_query = pd.read_csv('dataset/tweet/tweet_query.csv') + +word_dict, region_dict = lib.data_processing.loading_embedding("tweet") + + +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + true_num = data_train[data_train['is_in'] == 1].shape[0] + false_num = data_train[data_train['is_in'] == 0].shape[0] + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) + + # 生成一个用于神经网络输入的dataframe:embedding + embedding = pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, 
axis=1) + # print(embedding) + y = data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + # print(X) + return X, y, insert, true_num, false_num + + +X_train, y_train, train_insert, train_true, train_false = yelp_embedding(data_train, word_dict=word_dict, + region_dict=region_dict) +X_test, y_test, test_insert, test_true, test_false = yelp_embedding(data_test, word_dict=word_dict, + region_dict=region_dict) +X_query, y_query, query_insert, query_true, query_false = yelp_embedding(data_query, word_dict=word_dict, + region_dict=region_dict) +print(query_insert) +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) + +n_true = train_true + test_true +n_false = test_false + train_false + +n_test = test_true + test_false + +# 设置参数 +params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'num_leaves': 31, + 'learning_rate': 0.05, + 'feature_fraction': 0.9, + 'verbose': -1 +} + + +def evaluate_thresholds(prediction_results, y_true, bf_bytes): + sorted_indices = np.argsort(prediction_results) + sorted_predictions = prediction_results[sorted_indices] + sorted_true = y_true[sorted_indices] + + fp = n_false + tp = 0 + best_thresh = 0 + best_fpr_lbf = 1.0 + + unique_sorted_predictions, idx = np.unique(sorted_predictions, return_index=True) + + n = len(unique_sorted_predictions) + for i in range(n): + thresh = unique_sorted_predictions[i] + + if i < n - 1: + count_1 = np.sum(sorted_true[idx[i]:idx[i + 1]]) + tp += count_1 + fp -= idx[i + 1] - idx[i] - count_1 + else: + count_1 = np.sum(sorted_true[idx[i]:n]) + tp += count_1 + fp -= n - idx[i] - count_1 + + bf_count = tp + fpr_bf = lib.bf_util.get_fpr(bf_count, bf_bytes) + fpr_lgb = fp / n_false + + fpr_lbf = fpr_lgb + (1 - fpr_lgb) * fpr_bf + + if fpr_lbf < best_fpr_lbf: + best_thresh = thresh + best_fpr_lbf = fpr_lbf + + print(f'best thresh = {best_thresh} and best fpr = {best_fpr_lbf}') + return best_thresh, best_fpr_lbf + + +# def evaluate_thresholds(prediction_results, y_true, bf_bytes): +# sorted_indices = np.argsort(prediction_results) +# sorted_predictions = prediction_results[sorted_indices] +# sorted_true = y_true[sorted_indices] +# +# fp = n_false +# tp = 0 +# best_thresh = 0 +# best_fpr_lbf = 1.0 +# +# unique_sorted_predictions = np.unique(sorted_predictions) +# +# j = 0 +# for i in range(len(unique_sorted_predictions)): +# thresh = unique_sorted_predictions[i] +# +# while j < len(sorted_predictions) and sorted_predictions[j] == thresh: +# if sorted_true[j] == 1: +# tp += 1 +# else: +# fp -= 1 +# j += 1 +# +# bf_count = tp +# fpr_bf = lib.bf_util.get_fpr(bf_count, bf_bytes) +# fpr_lgb = fp / n_false +# fpr_lbf = fpr_lgb + (1 - fpr_lgb) * fpr_bf +# +# if fpr_lbf < best_fpr_lbf: +# best_thresh = thresh +# best_fpr_lbf = fpr_lbf +# +# print(f'best thresh = {best_thresh} and best fpr = {best_fpr_lbf}') +# return best_thresh, best_fpr_lbf + + +# def evaluate_thresholds(prediction_results, y_true, bf_bytes): +# # 量化预测值并转换为整数 +# quantized_predictions = np.round(prediction_results * 1000).astype(int) +# +# # # 获取最大值和最小值以确定桶的范围 +# max_pred = np.max(quantized_predictions) +# min_pred = np.min(quantized_predictions) +# # +# # # 初始化桶的数量 +# num_buckets = max_pred - min_pred + 1 +# tp_count = np.zeros(num_buckets, dtype=int) +# fp_count = np.zeros(num_buckets, dtype=int) +# +# indices = quantized_predictions - min_pred +# np.add.at(tp_count, indices[y_true == 1], 1) +# np.add.at(fp_count, indices[y_true == 0], 1) +# 
best_thresh = 0 +# best_fpr_lbf = 1.0 +# +# tp = 0 +# fp = n_false +# for i in range(num_buckets): +# tp += tp_count[i] +# fp -= fp_count[i] +# +# bf_count = tp +# fpr_bf = lib.bf_util.get_fpr(bf_count, bf_bytes) +# fpr_lgb = fp / n_false +# fpr_lbf = fpr_lgb + (1 - fpr_lgb) * fpr_bf +# +# if fpr_lbf < best_fpr_lbf: +# best_thresh = (i + min_pred) / 1000.0 +# best_fpr_lbf = fpr_lbf +# +# print(f'best thresh = {best_thresh} and best fpr = {best_fpr_lbf}') +# return best_thresh, best_fpr_lbf + + +start_time = time.perf_counter_ns() +initial_size = 64 * 1024 +max_size = 320 * 1024 + +# 循环,从32开始,每次乘以2,直到512 +size = initial_size +while size <= max_size: + print(f'size {size}') + bst = None + best_bst = None + best_fpr = 1.0 + best_threshold = 0.5 + epoch_each = 1 + epoch_now = 0 + epoch_max = 20 + best_epoch = 0 + for i in range(int(epoch_max / epoch_each)): + bst = lgb.train(params, train_data, epoch_each, valid_sets=[test_data], init_model=bst, + keep_training_booster=True) + bf_bytes = size - lib.lgb_url.lgb_get_model_size(bst) + if bf_bytes <= 0: + break + # prediction_results = bst.predict(X_test) + + # 对训练集进行预测 + train_pred = bst.predict(X_train) + test_pred = bst.predict(X_test) + + # 拼接预测结果 + all_predictions = np.concatenate([train_pred, test_pred]) + all_true_labels = np.concatenate([y_train, y_test]) + best_thresh, best_fpr_lbf = evaluate_thresholds(all_predictions, all_true_labels, bf_bytes) + + # best_thresh, best_fpr_lbf = evaluate_thresholds(prediction_results, y_test, bf_bytes) + + # 保存最佳模型 + if best_bst is None or best_fpr_lbf < best_fpr: + best_bst = bst.__copy__() + best_threshold = best_thresh + best_fpr = best_fpr_lbf + best_epoch = epoch_now + + epoch_now += epoch_each + + end_time = time.perf_counter_ns() + print(f'use {(end_time - start_time) / 1000000}ms') + + model_size = lib.lgb_url.lgb_get_model_size(best_bst) + print("模型在内存中所占用的大小(字节):", model_size) + print(f"best threshold:", best_threshold) + print(f"best epoch:", best_epoch) + + data_negative = lib.lgb_url.lgb_validate_url(best_bst, X_train, y_train, train_insert, X_test, y_test, test_insert, + best_threshold) + print(f"{len(data_negative)} insert into bloom filter") + bloom_size = size - model_size + + bloom_filter = lib.lgb_url.create_bloom_filter(dataset=data_negative, bf_size=bloom_size) + + # 访问布隆过滤器的 num_bits 属性 + num_bits = bloom_filter.num_bits + + # 将比特位转换为字节(8 bits = 1 byte) + memory_in_bytes = num_bits / 8 + print("memory of bloom filter: ", memory_in_bytes) + print("memory of learned model: ", model_size) + + fpr = lib.lgb_url.lgb_query_url(best_bst, bloom_filter, X_query, y_query, query_insert, best_threshold, False) + size = size + 64 * 1024 diff --git a/lgb_tweet_autoPLBF_main.py b/lgb_tweet_autoPLBF_main.py new file mode 100644 index 0000000..a594d86 --- /dev/null +++ b/lgb_tweet_autoPLBF_main.py @@ -0,0 +1,137 @@ +import time + +import lightgbm as lgb +import numpy as np +import pandas as pd +import lib.network +import lib.data_processing +import lib.lgb_url +from plbf.FastPLBF_M import FastPLBF_M + +data_train = pd.read_csv('dataset/tweet/tweet_train.csv') +data_test = pd.read_csv('dataset/tweet/tweet_test.csv') +data_query = pd.read_csv('dataset/tweet/tweet_query.csv') + +word_dict, region_dict = lib.data_processing.loading_embedding("tweet") + + +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + 
data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + true_num = data_train[data_train['is_in'] == 1].shape[0] + false_num = data_train[data_train['is_in'] == 0].shape[0] + + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + positive_insert = insert[data_train['is_in'] == 1] + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) + + # 生成一个用于神经网络输入的dataframe:embedding + embedding = pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1) + # print(embedding) + y = data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + # print(X) + return X, y, insert, true_num, false_num, positive_insert + + +X_train, y_train, train_insert, train_true, train_false, train_positive_insert = yelp_embedding(data_train, + word_dict=word_dict, + region_dict=region_dict) +X_test, y_test, test_insert, test_true, test_false, test_positive_insert = yelp_embedding(data_test, + word_dict=word_dict, + region_dict=region_dict) +X_query, y_query, query_insert, query_true, query_false, query_positive_insert = yelp_embedding(data_query, + word_dict=word_dict, + region_dict=region_dict) +print(train_positive_insert) +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) + +n_true = train_true + test_true +n_false = test_false + train_false + +n_test = test_true + test_false + +positive_samples = np.concatenate((X_train[y_train == 1], X_test[y_test == 1]), axis=0) +negative_samples = np.concatenate((X_train[y_train == 0], X_test[y_test == 0]), axis=0) + + +# 合并正类样本的URLs +positive_urls = pd.concat([train_positive_insert, test_positive_insert]) + +# 转换成list +positive_urls_list = positive_urls.tolist() + +# 设置参数 +params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'num_leaves': 31, + 'learning_rate': 0.05, + 'feature_fraction': 0.9, +} + +size = 64 * 1024 +bst = None +best_bst = None +best_fpr = 1.0 +epoch_each = 1 +epoch_now = 0 +epoch_max = 20 +best_epoch = 0 +best_plbf = None + +start_time = time.perf_counter_ns() +for i in range(int(epoch_max / epoch_each)): + bst = lgb.train(params, train_data, epoch_each, valid_sets=[test_data], init_model=bst, + keep_training_booster=True) + epoch_now += epoch_each + bf_bytes = size - lib.lgb_url.lgb_get_model_size(bst) + if bf_bytes <= 0: + break + if epoch_now < 2: + continue + + pos_scores = bst.predict(positive_samples).tolist() + neg_scores = bst.predict(negative_samples).tolist() + plbf = FastPLBF_M(positive_urls_list, pos_scores, neg_scores, bf_bytes * 8.0, 50, 5) + fpr = plbf.get_fpr() + if best_bst is None or fpr < best_fpr: + best_bst = bst.__copy__() + best_fpr = fpr + best_epoch = epoch_now + best_plbf = plbf + +end_time = time.perf_counter_ns() +print(f'use {(end_time - start_time) / 1000000}ms') + +model_size = lib.lgb_url.lgb_get_model_size(best_bst) +print("模型在内存中所占用的大小(字节):", model_size) +print(f"best epoch:", best_epoch) + +fp_cnt = 0 +query_negative = X_query +query_neg_keys = query_insert +query_neg_scores = 
best_bst.predict(X_query) +total = len(query_negative) + +for key, score in zip(query_neg_keys, query_neg_scores): + if best_plbf.contains(key, score): + fp_cnt += 1 +print(f"fpr: {float(fp_cnt) / total}") diff --git a/lgb_tweet_main.py b/lgb_tweet_main.py index 0e5febf..bb96276 100644 --- a/lgb_tweet_main.py +++ b/lgb_tweet_main.py @@ -12,9 +12,7 @@ word_dict, region_dict = loading_embedding(dataset) # 将's'标签(1)和'b'标签(0)分别过滤出来 df_s, df_b = loading_data(word_dict=word_dict, region_dict=region_dict, dataset=dataset, - dataset_type="train") - - + dataset_type="train") # 取出标签为's'的所有样本数 s_count = len(df_s) @@ -37,7 +35,6 @@ X_query = validate_df.drop('label', axis=1).values.astype(np.float32) y_query = validate_df['label'].values.astype(np.float32) - # # 数据标准化 scaler = StandardScaler() X = scaler.fit_transform(X) @@ -57,7 +54,7 @@ 'learning_rate': 0.1, 'feature_fraction': 1, } -all_memory = 5 * 1024 * 1024 #yelp大小 64 * 1024 # tweet模型大小:5 * 1024 * 1024 +all_memory = 5 * 1024 * 1024 # yelp大小 64 * 1024 # tweet模型大小:5 * 1024 * 1024 num_round = 10 bst = lgb.train(params, train_data, num_round, valid_sets=[test_data]) @@ -65,13 +62,11 @@ model_size = lib.lgb_url.lgb_get_model_size(bst) print("模型在内存中所占用的大小(字节):", model_size) - threshold = 0.5 data_negative = lib.lgb_url.lgb_validate(bst, X_train, y_train, X_test, y_test, threshold) bloom_size = all_memory - model_size -bloom_filter = lib.lgb_url.create_bloom_filter(dataset=data_negative, bf_name='best_tweet_bf', - bf_size=bloom_size) +bloom_filter = lib.lgb_url.create_bloom_filter(dataset=data_negative, bf_size=bloom_size) # 访问布隆过滤器的 num_bits 属性 num_bits = bloom_filter.num_bits @@ -81,4 +76,4 @@ print("memory of bloom filter: ", memory_in_bytes) print("memory of learned model: ", model_size) -fpr = lib.lgb_url.lgb_query(bst, bloom_filter, X_query, y_query, threshold, False) \ No newline at end of file +fpr = lib.lgb_url.lgb_query(bst, bloom_filter, X_query, y_query, threshold, False) diff --git a/lgb_url_bruteforce_main.py b/lgb_url_autoLBF_main.py similarity index 52% rename from lgb_url_bruteforce_main.py rename to lgb_url_autoLBF_main.py index 20ffbed..1853fb0 100644 --- a/lgb_url_bruteforce_main.py +++ b/lgb_url_autoLBF_main.py @@ -1,3 +1,5 @@ +import time + import lightgbm as lgb import numpy as np import pandas as pd @@ -5,7 +7,6 @@ import lib.bf_util import lib.lgb_url - df_train = pd.read_csv('dataset/url_train.csv') df_test = pd.read_csv('dataset/url_test.csv') df_query = pd.read_csv('dataset/url_query.csv') @@ -24,7 +25,6 @@ train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) - # 设置参数 params = { 'objective': 'binary', @@ -33,9 +33,9 @@ 'learning_rate': 0.05, 'feature_fraction': 0.9, } -all_memory = 32 * 1024 # tweet模型大小:5 * 1024 * 1024 n_true = df_train[df_train['url_type'] == 1].shape[0] + df_test[df_test['url_type'] == 1].shape[0] +n_false = df_train[df_train['url_type'] == 0].shape[0] + df_test[df_test['url_type'] == 0].shape[0] n_test = len(df_test) @@ -44,37 +44,31 @@ def evaluate_thresholds(prediction_results, y_true, bf_bytes): sorted_predictions = prediction_results[sorted_indices] sorted_true = y_true[sorted_indices] - print(sorted_predictions) - - total_positives = np.sum(sorted_true) - print(f'total positives = {total_positives}') - total_negatives = len(sorted_true) - total_positives - print(f'total negatives = {total_negatives}') - - # fp = 0 - # tp = total_positives - fp = total_negatives + fp = n_false tp = 0 best_thresh = 0 best_fpr_lbf = 1.0 
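# Reference semantics for a single candidate threshold in evaluate_thresholds (an illustrative
# sketch, not part of the patch): negatives scoring above the threshold are false positives of the
# model, keys scoring at or below it must be stored in the backup Bloom filter, and the two stages
# combine as fpr_lbf = fpr_lgb + (1 - fpr_lgb) * fpr_bf. The rewritten loop in this hunk computes
# the same quantities incrementally over the unique score values. `get_fpr` stands in for
# lib.bf_util.get_fpr, whose implementation is not shown in this diff.
import numpy as np

def lbf_fpr_at(thresh, scores, labels, bf_bytes, get_fpr):
    scores, labels = np.asarray(scores), np.asarray(labels)
    fp = int(np.sum((scores > thresh) & (labels == 0)))         # negatives the model lets through
    bf_count = int(np.sum((scores <= thresh) & (labels == 1)))  # keys kept in the backup filter
    fpr_lgb = fp / max(1, int(np.sum(labels == 0)))
    fpr_bf = get_fpr(bf_count, bf_bytes)                        # backup-filter FPR at this load
    return fpr_lgb + (1 - fpr_lgb) * fpr_bf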
- unique_sorted_predictions = np.unique(sorted_predictions) + unique_sorted_predictions, idx = np.unique(sorted_predictions, return_index=True) - j = 0 - for i in range(len(unique_sorted_predictions)): + n = len(unique_sorted_predictions) + for i in range(n): thresh = unique_sorted_predictions[i] - while j < len(sorted_predictions) and sorted_predictions[j] == thresh: - if sorted_true[j] == 1: - tp += 1 - else: - fp -= 1 - j += 1 + if i < n - 1: + count_1 = np.sum(sorted_true[idx[i]:idx[i+1]]) + tp += count_1 + fp -= idx[i+1] - idx[i] - count_1 + else: + count_1 = np.sum(sorted_true[idx[i]:n]) + tp += count_1 + fp -= n - idx[i] - count_1 - # bf_count = tp * float(n_true) / n_test bf_count = tp fpr_bf = lib.bf_util.get_fpr(bf_count, bf_bytes) - fpr_lgb = fp / total_negatives + fpr_lgb = fp / n_false + if fp>n_false: + print(fp,n_false) fpr_lbf = fpr_lgb + (1 - fpr_lgb) * fpr_bf if fpr_lbf < best_fpr_lbf: @@ -85,6 +79,83 @@ def evaluate_thresholds(prediction_results, y_true, bf_bytes): return best_thresh, best_fpr_lbf +# def evaluate_thresholds(prediction_results, y_true, bf_bytes): +# sorted_indices = np.argsort(prediction_results) +# sorted_predictions = prediction_results[sorted_indices] +# sorted_true = y_true[sorted_indices] +# +# fp = n_false +# tp = 0 +# best_thresh = 0 +# best_fpr_lbf = 1.0 +# +# unique_sorted_predictions = np.unique(sorted_predictions) +# +# j = 0 +# for i in range(len(unique_sorted_predictions)): +# thresh = unique_sorted_predictions[i] +# +# while j < len(sorted_predictions) and sorted_predictions[j] == thresh: +# if sorted_true[j] == 1: +# tp += 1 +# else: +# fp -= 1 +# j += 1 +# +# bf_count = tp +# fpr_bf = lib.bf_util.get_fpr(bf_count, bf_bytes) +# fpr_lgb = fp / n_false +# fpr_lbf = fpr_lgb + (1 - fpr_lgb) * fpr_bf +# +# if fpr_lbf < best_fpr_lbf: +# best_thresh = thresh +# best_fpr_lbf = fpr_lbf +# +# print(f'best thresh = {best_thresh} and best fpr = {best_fpr_lbf}') +# return best_thresh, best_fpr_lbf + + +# def evaluate_thresholds(prediction_results, y_true, bf_bytes): +# # 量化预测值并转换为整数 +# quantized_predictions = np.round(prediction_results * 1000).astype(int) +# +# # # 获取最大值和最小值以确定桶的范围 +# max_pred = np.max(quantized_predictions) +# min_pred = np.min(quantized_predictions) +# # +# # # 初始化桶的数量 +# num_buckets = max_pred - min_pred + 1 +# tp_count = np.zeros(num_buckets, dtype=int) +# fp_count = np.zeros(num_buckets, dtype=int) +# +# indices = quantized_predictions - min_pred +# np.add.at(tp_count, indices[y_true == 1], 1) +# np.add.at(fp_count, indices[y_true == 0], 1) +# best_thresh = 0 +# best_fpr_lbf = 1.0 +# +# tp = 0 +# fp = n_false +# for i in range(num_buckets): +# tp += tp_count[i] +# fp -= fp_count[i] +# +# bf_count = tp +# fpr_bf = lib.bf_util.get_fpr(bf_count, bf_bytes) +# fpr_lgb = fp / n_false +# fpr_lbf = fpr_lgb + (1 - fpr_lgb) * fpr_bf +# +# if fpr_lbf < best_fpr_lbf: +# best_thresh = (i + min_pred) / 1000.0 +# best_fpr_lbf = fpr_lbf +# +# print(f'best thresh = {best_thresh} and best fpr = {best_fpr_lbf}') +# return best_thresh, best_fpr_lbf + + +start_time = time.perf_counter_ns() + +size = 320 * 1024 bst = None best_bst = None best_fpr = 1.0 @@ -95,7 +166,7 @@ def evaluate_thresholds(prediction_results, y_true, bf_bytes): best_epoch = 0 for i in range(int(epoch_max / epoch_each)): bst = lgb.train(params, train_data, epoch_each, valid_sets=[test_data], init_model=bst, keep_training_booster=True) - bf_bytes = all_memory - lib.lgb_url.lgb_get_model_size(bst) + bf_bytes = size - lib.lgb_url.lgb_get_model_size(bst) if bf_bytes <= 0: break # 
prediction_results = bst.predict(X_test) @@ -120,15 +191,18 @@ def evaluate_thresholds(prediction_results, y_true, bf_bytes): epoch_now += epoch_each +end_time = time.perf_counter_ns() +print(f'use {(end_time - start_time) / 1000000}ms') model_size = lib.lgb_url.lgb_get_model_size(best_bst) print("模型在内存中所占用的大小(字节):", model_size) print(f"best threshold:", best_threshold) print(f"best epoch:", best_epoch) -data_negative = lib.lgb_url.lgb_validate_url(best_bst, X_train, y_train, train_urls, X_test, y_test, test_urls, best_threshold) +data_negative = lib.lgb_url.lgb_validate_url(best_bst, X_train, y_train, train_urls, X_test, y_test, test_urls, + best_threshold) print(f"{len(data_negative)} insert into bloom filter") -bloom_size = all_memory - model_size +bloom_size = size - model_size bloom_filter = lib.lgb_url.create_bloom_filter(dataset=data_negative, bf_size=bloom_size) diff --git a/lgb_url_autoPLBF_main.py b/lgb_url_autoPLBF_main.py new file mode 100644 index 0000000..226c350 --- /dev/null +++ b/lgb_url_autoPLBF_main.py @@ -0,0 +1,99 @@ +import time + +import lightgbm as lgb +import numpy as np +import pandas as pd + +import lib.lgb_url +from plbf.FastPLBF_M import FastPLBF_M + +df_train = pd.read_csv('dataset/url_train.csv') +df_test = pd.read_csv('dataset/url_test.csv') +df_query = pd.read_csv('dataset/url_query.csv') + +# 筛选出正类样本的URLs +positive_train_urls = df_train[df_train['url_type'] == 1]['url'] +positive_test_urls = df_test[df_test['url_type'] == 1]['url'] + +# 合并正类样本的URLs +positive_urls = pd.concat([positive_train_urls, positive_test_urls]) + +# 转换成list +positive_urls_list = positive_urls.tolist() + +query_urls = df_query['url'] + +X_train = df_train.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_train = df_train['url_type'].values.astype(np.float32) +X_test = df_test.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_test = df_test['url_type'].values.astype(np.float32) +X_query = df_query.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_query = df_query['url_type'].values.astype(np.float32) + +positive_samples = np.concatenate((X_train[y_train == 1], X_test[y_test == 1]), axis=0) +negative_samples = np.concatenate((X_train[y_train == 0], X_test[y_test == 0]), axis=0) + +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) + +# 设置参数 +params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'num_leaves': 31, + 'learning_rate': 0.05, + 'feature_fraction': 0.9, +} + +n_true = df_train[df_train['url_type'] == 1].shape[0] + df_test[df_test['url_type'] == 1].shape[0] +n_false = df_train[df_train['url_type'] == 0].shape[0] + df_test[df_test['url_type'] == 0].shape[0] +n_test = len(df_test) + +size = 64 * 1024 +bst = None +best_bst = None +best_fpr = 1.0 +epoch_each = 1 +epoch_now = 0 +epoch_max = 20 +best_epoch = 0 +best_plbf = None + +start_time = time.perf_counter_ns() +for i in range(int(epoch_max / epoch_each)): + bst = lgb.train(params, train_data, epoch_each, valid_sets=[test_data], init_model=bst, + keep_training_booster=True) + epoch_now += epoch_each + bf_bytes = size - lib.lgb_url.lgb_get_model_size(bst) + if bf_bytes <= 0: + break + if epoch_now < 2: + continue + + pos_scores = bst.predict(positive_samples).tolist() + neg_scores = bst.predict(negative_samples).tolist() + plbf = FastPLBF_M(positive_urls_list, pos_scores, neg_scores, bf_bytes * 8.0, 50, 5) + fpr = plbf.get_fpr() + if best_bst is None or fpr < best_fpr: + best_bst = 
bst.__copy__() + best_fpr = fpr + best_epoch = epoch_now + best_plbf = plbf + +end_time = time.perf_counter_ns() +print(f'use {(end_time - start_time) / 1000000}ms') + +model_size = lib.lgb_url.lgb_get_model_size(best_bst) +print("模型在内存中所占用的大小(字节):", model_size) +print(f"best epoch:", best_epoch) + +fp_cnt = 0 +query_negative = X_query +query_neg_keys = query_urls +query_neg_scores = best_bst.predict(X_query) +total = len(query_negative) + +for key, score in zip(query_neg_keys, query_neg_scores): + if best_plbf.contains(key, score): + fp_cnt += 1 +print(f"fpr: {float(fp_cnt) / total}") diff --git a/lgb_url_main.py b/lgb_url_main.py deleted file mode 100644 index 293b836..0000000 --- a/lgb_url_main.py +++ /dev/null @@ -1,117 +0,0 @@ -import lightgbm as lgb -import numpy as np -import pandas as pd -from bayes_opt import BayesianOptimization - -import lib.bf_util -import lib.lgb_url - - -df_train = pd.read_csv('dataset/url_train.csv') -df_test = pd.read_csv('dataset/url_test.csv') -df_query = pd.read_csv('dataset/url_query.csv') - -train_urls = df_train['url'] -test_urls = df_test['url'] -query_urls = df_query['url'] - -X_train = df_train.drop(columns=['url', 'url_type']).values.astype(np.float32) -y_train = df_train['url_type'].values.astype(np.float32) -X_test = df_test.drop(columns=['url', 'url_type']).values.astype(np.float32) -y_test = df_test['url_type'].values.astype(np.float32) -X_query = df_query.drop(columns=['url', 'url_type']).values.astype(np.float32) -y_query = df_query['url_type'].values.astype(np.float32) - -train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) -test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) - - -# 设置参数 -params = { - 'objective': 'binary', - 'metric': 'binary_logloss', - 'num_leaves': 31, - 'learning_rate': 0.05, - 'feature_fraction': 0.9, -} -all_memory = 256 * 1024 # tweet模型大小:5 * 1024 * 1024 - -n_true = df_train[df_train['url_type'] == 1].shape[0] + df_test[df_test['url_type'] == 1].shape[0] -n_test = len(df_test) - - -def evaluate_threshold(thresh, y_pred, y_true, bf_bytes, thresh_fpr_map): - y_pred_bin = (y_pred > thresh).astype(int) - fp_lgb = np.sum((y_pred_bin == 1) & (y_true == 0)) - bf_count = np.sum((y_pred_bin == 0) & (y_true == 1)) * float(n_true) / n_test - fpr_bf = lib.bf_util.get_fpr(bf_count, bf_bytes) - fpr_lgb = fp_lgb / np.sum(y_true == 0) - fpr_lbf = fpr_lgb + (1 - fpr_lgb) * fpr_bf - if thresh not in thresh_fpr_map or fpr_lbf < thresh_fpr_map[thresh]: - thresh_fpr_map[thresh] = fpr_lbf - return 1 - fpr_lbf - - -bst = None -best_bst = None -best_fpr = 1.0 -best_threshold = 0.5 -epoch_each = 2 -epoch_now = 0 -epoch_max = 20 -best_epoch = 0 -for i in range(int(epoch_max / epoch_each)): - bst = lgb.train(params, train_data, epoch_each, valid_sets=[test_data], init_model=bst, keep_training_booster=True) - prediction_results = bst.predict(X_test) - bf_bytes = all_memory - lib.lgb_url.lgb_get_model_size(bst) - if bf_bytes <= 0: - break - - thresh_fpr_map = {} - # 定义搜索空间 - pbounds = {'thresh': (0.0, 1.0)} - - # 定义优化器 - optimizer = BayesianOptimization( - f=lambda thresh: evaluate_threshold(thresh, prediction_results, y_test, bf_bytes, thresh_fpr_map), - pbounds=pbounds, - random_state=42, - allow_duplicate_points=True - ) - - # 进行优化 - optimizer.maximize(n_iter=20) - - # 最优阈值 - best_thresh = optimizer.max['params']['thresh'] - fpr_lbf = thresh_fpr_map[best_thresh] - if best_bst is None or fpr_lbf < best_fpr: - best_bst = bst.__copy__() - # bst.save_model('best_model.txt') - best_threshold = best_thresh - 
best_fpr = fpr_lbf - best_epoch = epoch_now - - epoch_now += epoch_each - thresh_fpr_map.clear() - - -model_size = lib.lgb_url.lgb_get_model_size(best_bst) -print("模型在内存中所占用的大小(字节):", model_size) -print(f"best threshold:", best_threshold) -print(f"best epoch:", best_epoch) - -data_negative = lib.lgb_url.lgb_validate(best_bst, X_train, y_train, train_urls, X_test, y_test, test_urls, best_threshold) -bloom_size = all_memory - model_size - -bloom_filter = lib.lgb_url.create_bloom_filter(dataset=data_negative, bf_size=bloom_size) - -# 访问布隆过滤器的 num_bits 属性 -num_bits = bloom_filter.num_bits - -# 将比特位转换为字节(8 bits = 1 byte) -memory_in_bytes = num_bits / 8 -print("memory of bloom filter: ", memory_in_bytes) -print("memory of learned model: ", model_size) - -fpr = lib.lgb_url.lgb_query(best_bst, bloom_filter, X_query, y_query, query_urls, best_threshold, False) diff --git a/lgb_yelp_autoLBF_main.py b/lgb_yelp_autoLBF_main.py new file mode 100644 index 0000000..1728ca8 --- /dev/null +++ b/lgb_yelp_autoLBF_main.py @@ -0,0 +1,258 @@ +import time + +import lightgbm as lgb +import numpy as np +import pandas as pd +import lib.network +import lib.data_processing +import lib.bf_util +import lib.lgb_url + +data_train = pd.read_csv('dataset/yelp/yelp_train.csv') +data_test = pd.read_csv('dataset/yelp/yelp_test.csv') +data_query = pd.read_csv('dataset/yelp/yelp_query.csv') + +word_dict, region_dict = lib.data_processing.loading_embedding("yelp") + + +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + true_num = data_train[data_train['is_in'] == 1].shape[0] + false_num = data_train[data_train['is_in'] == 0].shape[0] + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) + + # 生成一个用于神经网络输入的dataframe:embedding + embedding = pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1) + # print(embedding) + y = data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + # print(X) + return X, y, insert, true_num, false_num + + +X_train, y_train, train_insert, train_true, train_false = yelp_embedding(data_train, word_dict=word_dict, + region_dict=region_dict) +X_test, y_test, test_insert, test_true, test_false = yelp_embedding(data_test, word_dict=word_dict, + region_dict=region_dict) +X_query, y_query, query_insert, query_true, query_false = yelp_embedding(data_query, word_dict=word_dict, + region_dict=region_dict) +print(query_insert) +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) + +n_true = train_true + test_true +n_false = test_false + train_false + +n_test = test_true + test_false + +# 设置参数 +params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'num_leaves': 31, + 'learning_rate': 0.05, 
+ 'feature_fraction': 0.9, + 'verbose': -1 +} + + +def evaluate_thresholds(prediction_results, y_true, bf_bytes): + sorted_indices = np.argsort(prediction_results) + sorted_predictions = prediction_results[sorted_indices] + sorted_true = y_true[sorted_indices] + + fp = n_false + tp = 0 + best_thresh = 0 + best_fpr_lbf = 1.0 + + unique_sorted_predictions, idx = np.unique(sorted_predictions, return_index=True) + + n = len(unique_sorted_predictions) + for i in range(n): + thresh = unique_sorted_predictions[i] + + if i < n - 1: + count_1 = np.sum(sorted_true[idx[i]:idx[i + 1]]) + tp += count_1 + fp -= idx[i + 1] - idx[i] - count_1 + else: + count_1 = np.sum(sorted_true[idx[i]:n]) + tp += count_1 + fp -= n - idx[i] - count_1 + + bf_count = tp + fpr_bf = lib.bf_util.get_fpr(bf_count, bf_bytes) + fpr_lgb = fp / n_false + + fpr_lbf = fpr_lgb + (1 - fpr_lgb) * fpr_bf + + if fpr_lbf < best_fpr_lbf: + best_thresh = thresh + best_fpr_lbf = fpr_lbf + + print(f'best thresh = {best_thresh} and best fpr = {best_fpr_lbf}') + return best_thresh, best_fpr_lbf + + +# def evaluate_thresholds(prediction_results, y_true, bf_bytes): +# sorted_indices = np.argsort(prediction_results) +# sorted_predictions = prediction_results[sorted_indices] +# sorted_true = y_true[sorted_indices] +# +# fp = n_false +# tp = 0 +# best_thresh = 0 +# best_fpr_lbf = 1.0 +# +# unique_sorted_predictions = np.unique(sorted_predictions) +# +# j = 0 +# for i in range(len(unique_sorted_predictions)): +# thresh = unique_sorted_predictions[i] +# +# while j < len(sorted_predictions) and sorted_predictions[j] == thresh: +# if sorted_true[j] == 1: +# tp += 1 +# else: +# fp -= 1 +# j += 1 +# +# bf_count = tp +# fpr_bf = lib.bf_util.get_fpr(bf_count, bf_bytes) +# fpr_lgb = fp / n_false +# fpr_lbf = fpr_lgb + (1 - fpr_lgb) * fpr_bf +# +# if fpr_lbf < best_fpr_lbf: +# best_thresh = thresh +# best_fpr_lbf = fpr_lbf +# +# print(f'best thresh = {best_thresh} and best fpr = {best_fpr_lbf}') +# return best_thresh, best_fpr_lbf + + +# def evaluate_thresholds(prediction_results, y_true, bf_bytes): +# # 量化预测值并转换为整数 +# quantized_predictions = np.round(prediction_results * 1000).astype(int) +# +# # # 获取最大值和最小值以确定桶的范围 +# max_pred = np.max(quantized_predictions) +# min_pred = np.min(quantized_predictions) +# # +# # # 初始化桶的数量 +# num_buckets = max_pred - min_pred + 1 +# tp_count = np.zeros(num_buckets, dtype=int) +# fp_count = np.zeros(num_buckets, dtype=int) +# +# indices = quantized_predictions - min_pred +# np.add.at(tp_count, indices[y_true == 1], 1) +# np.add.at(fp_count, indices[y_true == 0], 1) +# best_thresh = 0 +# best_fpr_lbf = 1.0 +# +# tp = 0 +# fp = n_false +# for i in range(num_buckets): +# tp += tp_count[i] +# fp -= fp_count[i] +# +# bf_count = tp +# fpr_bf = lib.bf_util.get_fpr(bf_count, bf_bytes) +# fpr_lgb = fp / n_false +# fpr_lbf = fpr_lgb + (1 - fpr_lgb) * fpr_bf +# +# if fpr_lbf < best_fpr_lbf: +# best_thresh = (i + min_pred) / 1000.0 +# best_fpr_lbf = fpr_lbf +# +# print(f'best thresh = {best_thresh} and best fpr = {best_fpr_lbf}') +# return best_thresh, best_fpr_lbf + + +start_time = time.perf_counter_ns() +initial_size = 64 * 1024 +max_size = 320 * 1024 + +# 循环,从32开始,每次乘以2,直到512 +size = initial_size +while size <= max_size: + print(f'size {size}') + bst = None + best_bst = None + best_fpr = 1.0 + best_threshold = 0.5 + epoch_each = 1 + epoch_now = 0 + epoch_max = 20 + best_epoch = 0 + for i in range(int(epoch_max / epoch_each)): + bst = lgb.train(params, train_data, epoch_each, valid_sets=[test_data], init_model=bst, + 
keep_training_booster=True) + bf_bytes = size - lib.lgb_url.lgb_get_model_size(bst) + if bf_bytes <= 0: + break + # prediction_results = bst.predict(X_test) + + # 对训练集进行预测 + train_pred = bst.predict(X_train) + test_pred = bst.predict(X_test) + + # 拼接预测结果 + all_predictions = np.concatenate([train_pred, test_pred]) + all_true_labels = np.concatenate([y_train, y_test]) + best_thresh, best_fpr_lbf = evaluate_thresholds(all_predictions, all_true_labels, bf_bytes) + + # best_thresh, best_fpr_lbf = evaluate_thresholds(prediction_results, y_test, bf_bytes) + + # 保存最佳模型 + if best_bst is None or best_fpr_lbf < best_fpr: + best_bst = bst.__copy__() + best_threshold = best_thresh + best_fpr = best_fpr_lbf + best_epoch = epoch_now + + epoch_now += epoch_each + + end_time = time.perf_counter_ns() + print(f'use {(end_time - start_time) / 1000000}ms') + + model_size = lib.lgb_url.lgb_get_model_size(best_bst) + print("模型在内存中所占用的大小(字节):", model_size) + print(f"best threshold:", best_threshold) + print(f"best epoch:", best_epoch) + + data_negative = lib.lgb_url.lgb_validate_url(best_bst, X_train, y_train, train_insert, X_test, y_test, test_insert, + best_threshold) + print(f"{len(data_negative)} insert into bloom filter") + bloom_size = size - model_size + + bloom_filter = lib.lgb_url.create_bloom_filter(dataset=data_negative, bf_size=bloom_size) + + # 访问布隆过滤器的 num_bits 属性 + num_bits = bloom_filter.num_bits + + # 将比特位转换为字节(8 bits = 1 byte) + memory_in_bytes = num_bits / 8 + print("memory of bloom filter: ", memory_in_bytes) + print("memory of learned model: ", model_size) + + fpr = lib.lgb_url.lgb_query_url(best_bst, bloom_filter, X_query, y_query, query_insert, best_threshold, False) + size = size + 64 * 1024 diff --git a/lgb_yelp_autoPLBF_main.py b/lgb_yelp_autoPLBF_main.py new file mode 100644 index 0000000..2d9244d --- /dev/null +++ b/lgb_yelp_autoPLBF_main.py @@ -0,0 +1,137 @@ +import time + +import lightgbm as lgb +import numpy as np +import pandas as pd +import lib.network +import lib.data_processing +import lib.lgb_url +from plbf.FastPLBF_M import FastPLBF_M + +data_train = pd.read_csv('dataset/yelp/yelp_train.csv') +data_test = pd.read_csv('dataset/yelp/yelp_test.csv') +data_query = pd.read_csv('dataset/yelp/yelp_query.csv') + +word_dict, region_dict = lib.data_processing.loading_embedding("yelp") + + +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + true_num = data_train[data_train['is_in'] == 1].shape[0] + false_num = data_train[data_train['is_in'] == 0].shape[0] + + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + positive_insert = insert[data_train['is_in'] == 1] + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) + + # 生成一个用于神经网络输入的dataframe:embedding + embedding = pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1) + # print(embedding) + y = 
data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + # print(X) + return X, y, insert, true_num, false_num, positive_insert + + +X_train, y_train, train_insert, train_true, train_false, train_positive_insert = yelp_embedding(data_train, + word_dict=word_dict, + region_dict=region_dict) +X_test, y_test, test_insert, test_true, test_false, test_positive_insert = yelp_embedding(data_test, + word_dict=word_dict, + region_dict=region_dict) +X_query, y_query, query_insert, query_true, query_false, query_positive_insert = yelp_embedding(data_query, + word_dict=word_dict, + region_dict=region_dict) +print(train_positive_insert) +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) + +n_true = train_true + test_true +n_false = test_false + train_false + +n_test = test_true + test_false + +positive_samples = np.concatenate((X_train[y_train == 1], X_test[y_test == 1]), axis=0) +negative_samples = np.concatenate((X_train[y_train == 0], X_test[y_test == 0]), axis=0) + + +# 合并正类样本的URLs +positive_urls = pd.concat([train_positive_insert, test_positive_insert]) + +# 转换成list +positive_urls_list = positive_urls.tolist() + +# 设置参数 +params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'num_leaves': 31, + 'learning_rate': 0.05, + 'feature_fraction': 0.9, +} + +size = 320 * 1024 +bst = None +best_bst = None +best_fpr = 1.0 +epoch_each = 1 +epoch_now = 0 +epoch_max = 20 +best_epoch = 0 +best_plbf = None + +start_time = time.perf_counter_ns() +for i in range(int(epoch_max / epoch_each)): + bst = lgb.train(params, train_data, epoch_each, valid_sets=[test_data], init_model=bst, + keep_training_booster=True) + epoch_now += epoch_each + bf_bytes = size - lib.lgb_url.lgb_get_model_size(bst) + if bf_bytes <= 0: + break + if epoch_now < 2: + continue + + pos_scores = bst.predict(positive_samples).tolist() + neg_scores = bst.predict(negative_samples).tolist() + plbf = FastPLBF_M(positive_urls_list, pos_scores, neg_scores, bf_bytes * 8.0, 50, 5) + fpr = plbf.get_fpr() + if best_bst is None or fpr < best_fpr: + best_bst = bst.__copy__() + best_fpr = fpr + best_epoch = epoch_now + best_plbf = plbf + +end_time = time.perf_counter_ns() +print(f'use {(end_time - start_time) / 1000000}ms') + +model_size = lib.lgb_url.lgb_get_model_size(best_bst) +print("模型在内存中所占用的大小(字节):", model_size) +print(f"best epoch:", best_epoch) + +fp_cnt = 0 +query_negative = X_query +query_neg_keys = query_insert +query_neg_scores = best_bst.predict(X_query) +total = len(query_negative) + +for key, score in zip(query_neg_keys, query_neg_scores): + if best_plbf.contains(key, score): + fp_cnt += 1 +print(f"fpr: {float(fp_cnt) / total}") diff --git a/lgb_url_lbf_main.py b/lgb_yelp_lbf_main.py similarity index 64% rename from lgb_url_lbf_main.py rename to lgb_yelp_lbf_main.py index fd079a0..b761b8f 100644 --- a/lgb_url_lbf_main.py +++ b/lgb_yelp_lbf_main.py @@ -2,29 +2,56 @@ import numpy as np import pandas as pd from sklearn.metrics import log_loss - +import lib.data_processing +import lib.network import lib.lgb_url import lib.bf_util -df_train = pd.read_csv('dataset/url_train.csv') -df_test = pd.read_csv('dataset/url_test.csv') -df_query = pd.read_csv('dataset/url_query.csv') +data_train = pd.read_csv('dataset/yelp/yelp_train.csv') +data_test = pd.read_csv('dataset/yelp/yelp_test.csv') +data_query = pd.read_csv('dataset/yelp/yelp_query.csv') + +word_dict, region_dict = 
lib.data_processing.loading_embedding("yelp") + + +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) -train_urls = df_train['url'] -test_urls = df_test['url'] -query_urls = df_query['url'] + # 生成一个用于神经网络输入的dataframe:embedding + embedding = pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1) + # print(embedding) + y = data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + # print(X) + return X, y, insert -X_train = df_train.drop(columns=['url', 'url_type']).values.astype(np.float32) -y_train = df_train['url_type'].values.astype(np.float32) -X_test = df_test.drop(columns=['url', 'url_type']).values.astype(np.float32) -y_test = df_test['url_type'].values.astype(np.float32) -X_query = df_query.drop(columns=['url', 'url_type']).values.astype(np.float32) -y_query = df_query['url_type'].values.astype(np.float32) +X_train, y_train, train_insert = yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict) +X_test, y_test, test_insert = yelp_embedding(data_test, word_dict=word_dict, region_dict=region_dict) +X_query, y_query, query_insert = yelp_embedding(data_query, word_dict=word_dict, region_dict=region_dict) train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) -n_true = df_train[df_train['url_type'] == 1].shape[0] + df_test[df_test['url_type'] == 1].shape[0] -n_test = len(df_test) +n_true = data_train[data_train['is_in'] == 1].shape[0] + data_test[data_test['is_in'] == 1].shape[0] +n_test = len(data_test) best_params = None best_score = float('inf') @@ -115,7 +142,7 @@ def evaluate_threshold(thresh, y_pred, y_true, bf_bytes): best_score = score best_threshold = threshold - data_negative = lib.lgb_url.lgb_validate_url(bst, X_train, y_train, train_urls, X_test, y_test, test_urls, + data_negative = lib.lgb_url.lgb_validate_url(bst, X_train, y_train, train_insert, X_test, y_test, test_insert, best_threshold) bloom_filter = lib.lgb_url.create_bloom_filter(dataset=data_negative, bf_size=bloom_size) @@ -127,5 +154,5 @@ def evaluate_threshold(thresh, y_pred, y_true, bf_bytes): print("memory of bloom filter: ", memory_in_bytes) print("memory of learned model: ", model_size) - fpr = lib.lgb_url.lgb_query_url(bst, bloom_filter, X_query, y_query, query_urls, best_threshold, False) + fpr = lib.lgb_url.lgb_query_url(bst, bloom_filter, X_query, y_query, query_insert, best_threshold, False) size *= 2 diff --git a/lib/__pycache__/bf_util.cpython-311.pyc b/lib/__pycache__/bf_util.cpython-311.pyc index 1480502..35b68c7 100644 Binary files a/lib/__pycache__/bf_util.cpython-311.pyc and b/lib/__pycache__/bf_util.cpython-311.pyc differ 
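The renamed lgb_yelp_lbf_main.py above, like the autoPLBF variant before it, follows the standard learned Bloom filter pattern: a LightGBM model scores every key, the keys the model would reject (score below the chosen threshold) are inserted into a backup Bloom filter sized to whatever memory remains after the model, and a query is reported present if either the score clears the threshold or the backup filter contains the key. The snippet below is only a minimal sketch of that check, not the repo's lib.lgb_url implementation; the helper names and toy values are invented for illustration, and it reuses the bloom_filter package that the plbf code elsewhere in this diff already imports.

from bloom_filter import BloomFilter


def build_backup_filter(keys, scores, threshold, error_rate=0.01):
    # Keys the model scores below the threshold would otherwise be false negatives,
    # so they all go into the backup Bloom filter.
    missed = [k for k, s in zip(keys, scores) if s < threshold]
    bf = BloomFilter(max_elements=max(1, len(missed)), error_rate=error_rate)
    for k in missed:
        bf.add(k)
    return bf


def lbf_contains(key, score, threshold, backup_bf):
    # Report present if the model is confident, otherwise fall back to the backup filter.
    return score >= threshold or key in backup_bf


# Toy usage with made-up keys and scores: the combined structure never misses a key.
keys = ['k1', 'k2', 'k3']
scores = [0.92, 0.15, 0.61]
backup = build_backup_filter(keys, scores, threshold=0.5)
assert all(lbf_contains(k, s, threshold=0.5, backup_bf=backup) for k, s in zip(keys, scores))

The overall false positive rate is then the model's false positive rate above the threshold plus the backup filter's rate on the queries routed to it, which appears to be what the threshold sweep in evaluate_thresholds estimates against the remaining byte budget bf_bytes.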
diff --git a/lib/__pycache__/data_processing.cpython-311.pyc b/lib/__pycache__/data_processing.cpython-311.pyc index 3599604..f5365ee 100644 Binary files a/lib/__pycache__/data_processing.cpython-311.pyc and b/lib/__pycache__/data_processing.cpython-311.pyc differ diff --git a/lib/__pycache__/lgb_url.cpython-311.pyc b/lib/__pycache__/lgb_url.cpython-311.pyc new file mode 100644 index 0000000..55b6209 Binary files /dev/null and b/lib/__pycache__/lgb_url.cpython-311.pyc differ diff --git a/lib/__pycache__/network.cpython-311.pyc b/lib/__pycache__/network.cpython-311.pyc index cae38f5..7249848 100644 Binary files a/lib/__pycache__/network.cpython-311.pyc and b/lib/__pycache__/network.cpython-311.pyc differ diff --git a/lib/__pycache__/network_higgs.cpython-311.pyc b/lib/__pycache__/network_higgs.cpython-311.pyc new file mode 100644 index 0000000..af70300 Binary files /dev/null and b/lib/__pycache__/network_higgs.cpython-311.pyc differ diff --git a/lib/__pycache__/network_url.cpython-311.pyc b/lib/__pycache__/network_url.cpython-311.pyc index 1cf4f94..74b2c50 100644 Binary files a/lib/__pycache__/network_url.cpython-311.pyc and b/lib/__pycache__/network_url.cpython-311.pyc differ diff --git a/lib/bf_util.py b/lib/bf_util.py index 8742ac5..f0e4f10 100644 --- a/lib/bf_util.py +++ b/lib/bf_util.py @@ -10,8 +10,8 @@ def get_fpr(n_items, bf_size): def create_bloom_filter(dataset, bf_size): n_items = len(dataset) - print('n_items = ', n_items) - print('bf_size = ', bf_size) + # print('n_items = ', n_items) + # print('bf_size = ', bf_size) # # 创建布隆过滤器 bloom_filter = BloomFilter(capacity=max(1, n_items), error_rate=get_fpr(n_items, bf_size)) @@ -19,3 +19,14 @@ def create_bloom_filter(dataset, bf_size): bloom_filter.add(data) return bloom_filter + + +def create_bloom_filter_in_bits(dataset, bf_size): + n_items = len(dataset) + + # # 创建布隆过滤器 + bloom_filter = BloomFilter(capacity=max(1, n_items), error_rate=max(1e-8, 0.5 ** (bf_size * math.log(2) / n_items))) + for data in dataset: + bloom_filter.add(data) + + return bloom_filter diff --git a/lib/data_processing.py b/lib/data_processing.py index 6175389..893c851 100644 --- a/lib/data_processing.py +++ b/lib/data_processing.py @@ -37,7 +37,7 @@ def cal_region_id(lon, lat, x_min=27, x_max=54, y_min=-120, y_max=-74, one_kilo= def loading_embedding(dataset): # 加载区域的编码信息 - data = pd.read_csv('embedding/region_embedding.csv') + data = pd.read_csv('../embedding/region_embedding_new.csv') data['Merged_List'] = np.array(data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23']].apply( lambda x: x.tolist(), axis=1)) @@ -50,7 +50,7 @@ def loading_embedding(dataset): # 读取训练的关键字embedding word_dict = {} - with open('embedding/' + dataset + '_keywords_embedding.csv', newline='', encoding='utf-8') as csvfile: + with open('../embedding/' + dataset + '_keywords_embedding.csv', newline='', encoding='utf-8') as csvfile: reader = csv.reader(csvfile, delimiter=',') next(reader) # 遍历csv文件中的每一行并将其添加到字典中 diff --git a/lib/network.py b/lib/network.py index 39ab5f5..abeaf66 100644 --- a/lib/network.py +++ b/lib/network.py @@ -245,6 +245,7 @@ def keywords_embedding(keyword, word_dict): # cnt用于记录没找到对应的关键字的 # cnt+=1 # print(cnt) + print("not found") return np.zeros(300) @@ -257,7 +258,7 @@ def insert(ck): time_bucket = time.hour * 2 + time.minute // 30 time = str(time.year) + str(time.month).zfill(2) + str(time.day).zfill(2) + str(time_bucket).zfill(2) # print(time) - region_id = str(cal_region_id(lat, 
lon)).zfill(8) + region_id = str(cal_region_id(lat=lat, lon=lon)).zfill(8) try: keywords = keywords.replace(" ", "") except AttributeError: @@ -269,7 +270,10 @@ def to_embedding(d): region = np.array(d['region']) time = np.array(d['timestamp']) keywords = np.array(d['keywords']) - embedding = torch.tensor(np.concatenate((time, region, keywords)), dtype=torch.float32) + #print(keywords) + # embedding = torch.tensor(np.concatenate((time, region, keywords)), dtype=torch.float32) + embedding = np.concatenate((time, region, keywords)) + # print(embedding.shape) return embedding diff --git a/lgb_higgs_bruteforce_main.py b/lightGBM_yelp.py similarity index 50% rename from lgb_higgs_bruteforce_main.py rename to lightGBM_yelp.py index 2f35298..42a5aaf 100644 --- a/lgb_higgs_bruteforce_main.py +++ b/lightGBM_yelp.py @@ -1,25 +1,92 @@ import lightgbm as lgb -import numpy as np import pandas as pd - -import lib.bf_util +import lib.network +import lib.data_processing +import numpy as np import lib.lgb_url +import lib.bf_util +''' +df = pd.read_csv('dataset/yelp/query_data.csv') -df_train = pd.read_csv('dataset/higgs_train.csv') -df_test = pd.read_csv('dataset/higgs_test.csv') -df_query = pd.read_csv('dataset/higgs_query.csv') +# 将标签(1)和标签(0)分别过滤出来 +df_1 = df[df['is_in'] == 1] +df_0 = df[df['is_in'] == 0] -X_train = df_train.drop(columns=['Label']).values.astype(np.float32) -y_train = df_train['Label'].values.astype(np.float32) -X_test = df_test.drop(columns=['Label']).values.astype(np.float32) -y_test = df_test['Label'].values.astype(np.float32) -X_query = df_query.drop(columns=['Label']).values.astype(np.float32) -y_query = df_query['Label'].values.astype(np.float32) +# 从标签为'b'的样本中随机抽取和's'标签相同数量的样本 +df_1_sample = df_1.sample(frac=0.8, random_state=42) +df_0_sample = df_0.sample(frac=0.8, random_state=42) + +# 合并得到训练集+测试集 +df_train_test = pd.concat([df_1_sample, df_0_sample]) + +# 剩下的标签为'b'的样本作为查询集 +df_query = df_0.drop(df_0_sample.index) + +df_train, df_test = train_test_split(df_train_test, test_size=0.2, random_state=42) + + + +df_train.to_csv('dataset/yelp/yelp_train.csv', index=False) +df_test.to_csv('dataset/yelp/yelp_test.csv', index=False) +df_query.to_csv('dataset/yelp/yelp_query.csv', index=False) +''' + +data_train = pd.read_csv('dataset/yelp/yelp_train.csv') +data_test = pd.read_csv('dataset/yelp/yelp_test.csv') +data_query = pd.read_csv('dataset/yelp/yelp_query.csv') + +word_dict, region_dict = lib.data_processing.loading_embedding("yelp") -train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) -test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) + + # 生成一个用于神经网络输入的dataframe:embedding + embedding = 
pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1) + #print(embedding) + y = data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + #print(X) + return X, y, insert + + +X_train, y_train, train_insert = yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict) +X_test, y_test, test_insert = yelp_embedding(data_test, word_dict=word_dict, region_dict=region_dict) +X_query, y_query, query_insert = yelp_embedding(data_query, word_dict=word_dict, region_dict=region_dict) + +n_true = data_train[data_train['is_in'] == 1].shape[0] + data_test[data_test['is_in'] == 1].shape[0] +n_test = len(data_test) +# 清理内存 + + +# 3. 划分训练集和测试集 +X_train = X_train.values.astype(np.float32) +X_test = X_test.values.astype(np.float32) +y_train = y_train.values.astype(np.float32) +y_test = y_test.values.astype(np.float32) +# 4. 创建 LightGBM 数据集 +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, free_raw_data=False) # 设置参数 params = { @@ -29,10 +96,7 @@ 'learning_rate': 0.05, 'feature_fraction': 0.9, } -all_memory = 8 * 1024 # tweet模型大小:5 * 1024 * 1024 - -n_true = df_train[df_train['Label'] == 1].shape[0] + df_test[df_test['Label'] == 1].shape[0] -n_test = len(df_test) +all_memory = 512 * 1024 # tweet模型大小:5 * 1024 * 1024 def evaluate_thresholds(prediction_results, y_true, bf_bytes): @@ -89,6 +153,7 @@ def evaluate_thresholds(prediction_results, y_true, bf_bytes): epoch_now = 0 epoch_max = 20 best_epoch = 0 + for i in range(int(epoch_max / epoch_each)): bst = lgb.train(params, train_data, epoch_each, valid_sets=[test_data], init_model=bst, keep_training_booster=True) bf_bytes = all_memory - lib.lgb_url.lgb_get_model_size(bst) @@ -116,13 +181,13 @@ def evaluate_thresholds(prediction_results, y_true, bf_bytes): epoch_now += epoch_each - model_size = lib.lgb_url.lgb_get_model_size(best_bst) print("模型在内存中所占用的大小(字节):", model_size) print(f"best threshold:", best_threshold) print(f"best epoch:", best_epoch) -data_negative = lib.lgb_url.lgb_validate(best_bst, X_train, y_train, X_test, y_test, best_threshold) +data_negative = lib.lgb_url.lgb_validate_url(best_bst, X_train, y_train, train_insert, X_test, y_test, test_insert, + best_threshold) print(f"{len(data_negative)} insert into bloom filter") bloom_size = all_memory - model_size @@ -136,4 +201,4 @@ def evaluate_thresholds(prediction_results, y_true, bf_bytes): print("memory of bloom filter: ", memory_in_bytes) print("memory of learned model: ", model_size) -fpr = lib.lgb_url.lgb_query(best_bst, bloom_filter, X_query, y_query, best_threshold, False) +fpr = lib.lgb_url.lgb_query_url(best_bst, bloom_filter, X_query, y_query, query_insert, best_threshold, False) diff --git a/main_higgs.py b/main_higgs.py deleted file mode 100644 index b7ed73b..0000000 --- a/main_higgs.py +++ /dev/null @@ -1,132 +0,0 @@ -import numpy as np -import pandas as pd -import torch -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import StandardScaler -from torch.utils.data import Dataset, DataLoader - -import lib.network_higgs - - -class HIGGSDataset(Dataset): - def __init__(self, X_, y_): - self.X = torch.tensor(X_) - self.y = torch.tensor(y_) - - def __len__(self): - return len(self.X) - - def __getitem__(self, idx): - return self.X[idx], self.y[idx] - - -# 加载数据集 -df = pd.read_csv('E:\\python\\higgs\\training\\training.csv') - -# 标签编码 -df['Label'] = df['Label'].apply(lambda x: 1 if x == 
's' else 0) - - -# 数据预处理 -# def check_multicollinearity(df, threshold=0.7): -# df = pd.DataFrame(df) # Convert dataset to a DataFrame if needed -# numeric_cols = df.select_dtypes(include=[np.number]).columns -# df_numeric = df[numeric_cols] -# corr_matrix = df_numeric.corr().abs() # Calculate the correlation matrix -# cols = corr_matrix.columns -# multicollinear_features = set() -# -# for i in range(len(cols)): -# for j in range(i + 1, len(cols)): -# if corr_matrix.iloc[i, j] >= threshold: -# multicollinear_features.add(cols[i]) -# multicollinear_features.add(cols[j]) -# return multicollinear_features - - -# multicollinear_cols = check_multicollinearity(df) -# df.drop(multicollinear_cols, axis=1, inplace=True) - -# 将's'标签(1)和'b'标签(0)分别过滤出来 -df_s = df[df['Label'] == 1] -df_b = df[df['Label'] == 0] - -# 取出标签为's'的所有样本数 -s_count = len(df_s) - -# 从标签为'b'的样本中随机抽取和's'标签相同数量的样本 -df_b_sample = df_b.sample(n=s_count, random_state=42) - -# 合并得到训练集(1:1比例) -train_df = pd.concat([df_s, df_b_sample]) - -# 剩下的标签为'b'的样本作为测试集 -validate_df = df_b.drop(df_b_sample.index) - -# 如果需要拆分特征和标签,可以如下进行 -X = train_df.drop('Label', axis=1).values.astype(np.float32) -y = train_df['Label'].values.astype(np.float32) -X_query = validate_df.drop('Label', axis=1).values.astype(np.float32) -y_query = validate_df['Label'].values.astype(np.float32) - -# # 数据预处理 -# X = df.drop(columns=['Label']).values.astype(np.float32) -# y = df['Label'].values.astype(np.float32) -# -# # 数据标准化 -scaler = StandardScaler() -X = scaler.fit_transform(X) -X_query = scaler.fit_transform(X_query) -# -# # 划分训练集和测试集 -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42) - -train_dataset = HIGGSDataset(X_train, y_train) -test_dataset = HIGGSDataset(X_test, y_test) -train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) -test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False) - -input_dim = X_train.shape[1] -print('input_dim = ', input_dim) -output_dim = 1 -all_memory = 15 * 1024 # tweet模型大小:5 * 1024 * 1024 -all_record = df.size -learning_rate = 0.001 -hidden_units = (8, 96) - -# nas_opt = lib.network_higgs.Bayes_Optimizer(input_dim=input_dim, output_dim=output_dim, train_loader=train_loader, -# val_loader=test_loader, learning_rate=learning_rate, -# hidden_units=hidden_units, all_record=all_record, all_memory=all_memory) -# model = nas_opt.optimize() -# print("has optimized") -# lib.network_higgs.train(model, train_loader=train_loader, num_epochs=30, val_loader=test_loader) -# torch.save(model, 'best_higgs_model_15.pth') - -# model = torch.load('best_higgs_model_15.pth') - -model = lib.network_higgs.SimpleNetwork([64], input_dim=input_dim, output_dim=output_dim) -lib.network_higgs.train(model, train_loader=train_loader, val_loader=test_loader, num_epochs=30) -# print(lib.network_higgs.get_model_size(model)) - -data_negative = lib.network_higgs.validate(model, X_train, y_train, X_test, y_test) - -model.eval() - -# 获得学习模型的内存大小 -model_size = lib.network_higgs.get_model_size(model) -bloom_size = all_memory - model_size - -bloom_filter = lib.network_higgs.create_bloom_filter(dataset=data_negative, bf_name='best_higgs_bf_3000', - bf_size=bloom_size) -# with open('best_higgs_bf_3000', 'rb') as bf_file: -# bloom_filter = pickle.load(bf_file) - -# 访问布隆过滤器的 num_bits 属性 -num_bits = bloom_filter.num_bits - -# 将比特位转换为字节(8 bits = 1 byte) -memory_in_bytes = num_bits / 8 -print("memory of bloom filter: ", memory_in_bytes) -print("memory of learned model: ", model_size) - -lib.network_higgs.query(model, 
bloom_filter, X_query, y_query) diff --git a/main_higgs_bf.py b/main_higgs_bf.py deleted file mode 100644 index 3b2beee..0000000 --- a/main_higgs_bf.py +++ /dev/null @@ -1,84 +0,0 @@ -import numpy as np -import pandas as pd -from sklearn.model_selection import train_test_split - -import lib.bf_util - -# run these code to generate dataset - -# # # 加载数据集 -# df = pd.read_csv('dataset/higgs.csv') -# -# # 将标签(1)和标签(0)分别过滤出来 -# df_1 = df[df['Label'] == 's'] -# df_0 = df[df['Label'] == 'b'] -# print(len(df_1)) # 85667 -# print(len(df_0)) # 164333 -# -# df_0_sample = df_0.sample(frac=0.8, random_state=42) -# -# # 合并得到训练集+测试集 -# df_train_test = pd.concat([df_1, df_0_sample]) -# # 剩下的标签为'b'的样本作为查询集 -# df_query = df_0.drop(df_0_sample.index) -# -# df_train_test['Label'] = df_train_test['Label'].replace({'s': 1, 'b': 0}) -# df_query['Label'] = df_query['Label'].replace({'s': 1, 'b': 0}) -# -# df_train, df_test = train_test_split(df_train_test, test_size=0.2, random_state=42) -# -# # 确保转换后的标签正确无误 -# print(df_train['Label'].unique()) -# print(df_test['Label'].unique()) -# print(df_query['Label'].unique()) -# -# df_train.to_csv('dataset/higgs_train.csv', index=False) -# df_test.to_csv('dataset/higgs_test.csv', index=False) -# df_query.to_csv('dataset/higgs_query.csv', index=False) - - -df_train = pd.read_csv('dataset/higgs_train.csv') -df_test = pd.read_csv('dataset/higgs_test.csv') -df_query = pd.read_csv('dataset/higgs_query.csv') - -# 获取训练集中url_type为1的行的索引 -id_train = df_train[df_train['Label'] == 1].index.tolist() - -# 获取测试集中url_type为1的行的索引 -id_test = df_test[df_test['Label'] == 1].index.tolist() - -# 组合训练集和测试集的url_type为1的url数据 -combined_data = pd.concat([df_train.loc[id_train], df_test.loc[id_test]], axis=0) - -# 定义布隆过滤器初始大小 -initial_size = 1 -max_size = 16 - -# 循环,从32开始,每次乘以2,直到256 -size = initial_size -while size <= max_size: - bloom_size = size * 1024 - bloom_filter = lib.bf_util.create_bloom_filter(dataset=combined_data, bf_size=bloom_size) - - # 统计假阳性率 - fp = 0 - fn = 0 - total_neg = 0 - # 遍历df_query中的每一个url列来查询布隆过滤器 - for index, row in df_query.iterrows(): - data = row.drop(labels={'Label'}) - # print(f'data = {data}') - true_label = row['Label'] - - if true_label == 0: - total_neg += 1 - if data in bloom_filter: - fp = fp + 1 - else: - print('contain positive query') - if data not in bloom_filter: - fn = fn + 1 - print(f'error for data {data}') - - print(f'fpr: {fp / total_neg}') - size *= 2 diff --git a/main_url.py b/main_url.py deleted file mode 100644 index 53f9b01..0000000 --- a/main_url.py +++ /dev/null @@ -1,133 +0,0 @@ -import numpy as np -import pandas as pd -import torch -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import StandardScaler -from torch.utils.data import Dataset, DataLoader - -import lib.network_url - - -class HIGGSDataset(Dataset): - def __init__(self, X_, y_): - self.X = torch.tensor(X_) - self.y = torch.tensor(y_) - - def __len__(self): - return len(self.X) - - def __getitem__(self, idx): - return self.X[idx], self.y[idx] - - -# 加载数据集 -df = pd.read_csv('dataset/train.csv') - -# 标签编码 -# df['Label'] = df['Label'].apply(lambda x: 1 if x == 's' else 0) - -# 数据预处理 -# def check_multicollinearity(df, threshold=0.7): -# df = pd.DataFrame(df) # Convert dataset to a DataFrame if needed -# numeric_cols = df.select_dtypes(include=[np.number]).columns -# df_numeric = df[numeric_cols] -# corr_matrix = df_numeric.corr().abs() # Calculate the correlation matrix -# cols = corr_matrix.columns -# multicollinear_features = set() -# -# for i in 
range(len(cols)): -# for j in range(i + 1, len(cols)): -# if corr_matrix.iloc[i, j] >= threshold: -# multicollinear_features.add(cols[i]) -# multicollinear_features.add(cols[j]) -# return multicollinear_features - - -# multicollinear_cols = check_multicollinearity(df) -# df.drop(multicollinear_cols, axis=1, inplace=True) - -# 将's'标签(1)和'b'标签(0)分别过滤出来 -df_s = df[df['url_type'] == 1] -df_b = df[df['url_type'] == 0] -# 取出标签为's'的所有样本数 -s_count = len(df_s) - -# 从标签为'b'的样本中随机抽取和's'标签相同数量的样本 -df_b_sample = df_b.sample(n=s_count, random_state=42) - -# 合并得到训练集(1:1比例) -train_df = pd.concat([df_s, df_b_sample]) - -# 剩下的标签为'b'的样本作为测试集 -validate_df = df_b.drop(df_b_sample.index) -print(validate_df) -# 如果需要拆分特征和标签,可以如下进行 -X = train_df.drop('url_type', axis=1).values.astype(np.float32) -y = train_df['url_type'].values.astype(np.float32) -X_query = validate_df.drop('url_type', axis=1).values.astype(np.float32) -y_query = validate_df['url_type'].values.astype(np.float32) - -# # 数据预处理 -# X = df.drop(columns=['Label']).values.astype(np.float32) -# y = df['Label'].values.astype(np.float32) -# -# # 数据标准化 -scaler = StandardScaler() -X = scaler.fit_transform(X) -X_query = scaler.fit_transform(X_query) -# -# # 划分训练集和测试集 -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - -train_dataset = HIGGSDataset(X_train, y_train) -test_dataset = HIGGSDataset(X_test, y_test) -train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) -test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False) - -input_dim = X_train.shape[1] -print('input_dim = ', input_dim) -print('test dataset size = ', len(test_dataset)) -output_dim = 1 -all_memory = 128 * 1024 # tweet模型大小:5 * 1024 * 1024 -all_record = df.size -learning_rate = 0.001 -hidden_units = (8, 512) - -# nas_opt = lib.network_url.Bayes_Optimizer(input_dim=input_dim, output_dim=output_dim, train_loader=train_loader, -# val_loader=test_loader, true_data_count=len(df_s), -# hidden_units=hidden_units, all_record=all_record, all_memory=all_memory) -# model = nas_opt.optimize() -# print("has optimized") -# lib.network_url.train(model, train_loader=train_loader, n_true=len(df_s), -# bf_memory=all_memory-lib.network_url.get_model_size(model), num_epochs=30, -# val_loader=test_loader) -# torch.save(model, 'best_url_model.pth') -# -model = lib.network_url.SimpleNetwork([8, 32, 8], input_dim=input_dim, output_dim=output_dim) -lib.network_url.train(model, train_loader=train_loader, bf_memory=all_memory - lib.network_url.get_model_size(model), - n_true=len(df_s), val_loader=test_loader, num_epochs=30) - -# lib.network_url.train_with_fpr_1(model, train_loader=train_loader, -# bf_memory=all_memory - lib.network_url.get_model_size(model), -# true_data_count=len(df_s), val_loader=test_loader, num_epochs=30) - -data_negative = lib.network_url.validate(model, X_train, y_train, X_test, y_test) - -model.eval() - -# 获得学习模型的内存大小 -model_size = lib.network_url.get_model_size(model) -bloom_size = all_memory - model_size - -bloom_filter = lib.network_url.create_bloom_filter(dataset=data_negative, bf_name='best_higgs_bf_3000', - bf_size=bloom_size) - -# 访问布隆过滤器的 num_bits 属性 -num_bits = bloom_filter.num_bits - -# 将比特位转换为字节(8 bits = 1 byte) -memory_in_bytes = num_bits / 8 -print("memory of bloom filter: ", memory_in_bytes) -print("memory of learned model: ", model_size) - -fpr = lib.network_url.query(model, bloom_filter, X_query, y_query) diff --git a/main_yelp_bf.py b/main_yelp_bf.py new file mode 100644 index 0000000..b148a5a --- /dev/null +++ 
b/main_yelp_bf.py @@ -0,0 +1,93 @@ +import numpy as np +import pandas as pd +import lib.network +import lib.bf_util + +# run this code to generate the dataset +""" +# load the dataset +df = pd.read_csv('dataset/url.csv') + +# split out the label-1 and label-0 samples +df_1 = df[df['url_type'] == 1] +df_0 = df[df['url_type'] == 0] + +# randomly sample a fraction of the label-0 samples +df_0_sample = df_0.sample(frac=0.8, random_state=42) + +# merge to form the train + test set +df_train_test = pd.concat([df_1, df_0_sample]) + +# the remaining label-0 samples form the query set +df_query = df_0.drop(df_0_sample.index) + +df_train, df_test = train_test_split(df_train_test, test_size=0.2, random_state=42) + +df_train.to_csv('dataset/url_train.csv', index=False) +df_test.to_csv('dataset/url_test.csv', index=False) +df_query.to_csv('dataset/url_query.csv', index=False) +""" + +data_train = pd.read_csv('dataset/yelp/yelp_train.csv') +data_test = pd.read_csv('dataset/yelp/yelp_test.csv') +data_query = pd.read_csv('dataset/yelp/yelp_query.csv') + + +def yelp_insert(data): + data.loc[:, 'keywords'] = data['keywords'].str.split(' ') + data = data.explode('keywords') + data = data.reset_index(drop=True) + data['keywords'] = data['keywords'].astype(str) + data['keywords'] = data['keywords'].apply(str.lower) + + insert = pd.DataFrame() + insert['insert'] = data.apply(lib.network.insert, axis=1) + return insert + + +# select the positive (is_in == 1) rows of the training set +id_train = data_train[data_train['is_in'] == 1] + +# select the positive (is_in == 1) rows of the test set +id_test = data_test[data_test['is_in'] == 1] + +insert_train = yelp_insert(id_train) +insert_test = yelp_insert(id_test) +insert_query = yelp_insert(data_query) + +print(insert_train) +combined_data = np.concatenate((insert_train.values, insert_test.values), axis=0) +print(insert_query) + +# initial Bloom filter size (KB) +initial_size = 32 +max_size = 512 + +# loop from the initial size, doubling each time up to max_size +size = initial_size +while size <= max_size: + bloom_size = size * 1024 + bloom_filter = lib.bf_util.create_bloom_filter(dataset=combined_data, bf_size=bloom_size) + + # measure the false positive rate + fp = 0 + fn = 0 + total_neg = 0 + # probe the Bloom filter with every entry of insert_query + for index, row in insert_query.iterrows(): + url = row['insert'] + true_label = 0 # 0 = negative, 1 = positive + + if true_label == 0: + total_neg += 1 + if url in bloom_filter: + fp = fp + 1 + else: + print('contain positive query') + if url not in bloom_filter: + fn = fn + 1 + print(f'error for url {url}') + + print(f'fpr: {fp / total_neg}') + size *= 2 + diff --git a/model_generate.py b/model_generate.py new file mode 100644 index 0000000..7d4cd00 --- /dev/null +++ b/model_generate.py @@ -0,0 +1,78 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd +from sklearn.metrics import log_loss + +import lib.lgb_url +import lib.bf_util + + +def get_model(max_model_memory): + df_train = pd.read_csv('dataset/url_train.csv') + df_test = pd.read_csv('dataset/url_test.csv') + + X_train = df_train.drop(columns=['url', 'url_type']).values.astype(np.float32) + y_train = df_train['url_type'].values.astype(np.float32) + X_test = df_test.drop(columns=['url', 'url_type']).values.astype(np.float32) + y_test = df_test['url_type'].values.astype(np.float32) + + train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) + test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) + n_true = df_train[df_train['url_type'] == 1].shape[0] + df_test[df_test['url_type'] == 1].shape[0] + + best_params = None + best_score = float('inf') + + # define the parameter space + num_leaves_list = range(2, 32) # number of leaves from 2 to 31 + num_rounds_list = range(1, 21) # number of boosting rounds from 1 to 20 + + # grid search over the parameter space + for num_leaves in num_leaves_list: + for num_rounds in 
num_rounds_list: + params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'num_leaves': num_leaves, + 'learning_rate': 0.05, + 'feature_fraction': 0.9, + } + + model = lgb.train(params, train_data, num_boost_round=num_rounds, valid_sets=[test_data]) + print( + f'num_leaves, num_rounds, memory = {num_leaves}, {num_rounds}, {lib.lgb_url.lgb_get_model_size(model)}') + + if lib.lgb_url.lgb_get_model_size(model) >= max_model_memory: + break + + # 在验证集上评估 + valid_pred = model.predict(X_test) + logloss_ = log_loss(y_test, valid_pred) + + # 记录最佳参数和得分 + if logloss_ < best_score: + best_score = logloss_ + best_params = {'num_leaves': num_leaves, 'num_rounds': num_rounds} + + # 设置参数 + params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'num_leaves': best_params['num_leaves'], + 'learning_rate': 0.05, + 'feature_fraction': 0.9, + } + bst = lgb.train(params, train_data, best_params['num_rounds'], valid_sets=[test_data]) + + print('best num_leaves = ', best_params['num_leaves']) + print('best num_rounds = ', best_params['num_rounds']) + print('true data size = ', n_true) + + # 初始化变量 + model_size = lib.lgb_url.lgb_get_model_size(bst) + print("模型在内存中所占用的大小(字节):", model_size) + bst.save_model('best_bst_' + str(max_model_memory)) + return bst + + +get_model(20 * 1024) diff --git a/model_genetate_yelp.py b/model_genetate_yelp.py new file mode 100644 index 0000000..de4639f --- /dev/null +++ b/model_genetate_yelp.py @@ -0,0 +1,115 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd +from sklearn.metrics import log_loss +import lib.data_processing +import lib.network +import lib.lgb_url +import lib.bf_util + + +def get_model(max_model_memory): + data_train = pd.read_csv('dataset/yelp/yelp_train.csv') + data_test = pd.read_csv('dataset/yelp/yelp_test.csv') + data_query = pd.read_csv('dataset/yelp/yelp_query.csv') + + word_dict, region_dict = lib.data_processing.loading_embedding("yelp") + + def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) + + # 生成一个用于神经网络输入的dataframe:embedding + embedding = pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1) + # print(embedding) + y = data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + # print(X) + return X, y, insert + + X_train, y_train, train_insert = yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict) + X_test, y_test, test_insert = yelp_embedding(data_test, word_dict=word_dict, region_dict=region_dict) + X_train = X_train.values.astype(np.float32) + X_test = X_test.values.astype(np.float32) + y_train = y_train.values.astype(np.float32) + y_test = y_test.values.astype(np.float32) + # 4. 
创建 LightGBM 数据集 + train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) + test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, free_raw_data=False) + + n_true = data_train[data_train['is_in'] == 1].shape[0] + data_test[data_test['is_in'] == 1].shape[0] + + best_params = None + best_score = float('inf') + + # 定义参数空间 + num_leaves_list = range(2, 32) # 叶子数量从2到31 + num_rounds_list = range(1, 21) # 训练轮次从1到20 + + # 循环遍历参数空间 + for num_leaves in num_leaves_list: + for num_rounds in num_rounds_list: + params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'num_leaves': num_leaves, + 'learning_rate': 0.05, + 'feature_fraction': 0.9, + } + + model = lgb.train(params, train_data, num_boost_round=num_rounds, valid_sets=[test_data]) + print( + f'num_leaves, num_rounds, memory = {num_leaves}, {num_rounds}, {lib.lgb_url.lgb_get_model_size(model)}') + + if lib.lgb_url.lgb_get_model_size(model) >= max_model_memory: + break + + # 在验证集上评估 + valid_pred = model.predict(X_test) + logloss_ = log_loss(y_test, valid_pred) + + # 记录最佳参数和得分 + if logloss_ < best_score: + best_score = logloss_ + best_params = {'num_leaves': num_leaves, 'num_rounds': num_rounds} + + # 设置参数 + params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'num_leaves': best_params['num_leaves'], + 'learning_rate': 0.05, + 'feature_fraction': 0.9, + } + bst = lgb.train(params, train_data, best_params['num_rounds'], valid_sets=[test_data]) + + print('best num_leaves = ', best_params['num_leaves']) + print('best num_rounds = ', best_params['num_rounds']) + print('true data size = ', n_true) + + # 初始化变量 + model_size = lib.lgb_url.lgb_get_model_size(bst) + print("模型在内存中所占用的大小(字节):", model_size) + bst.save_model('best_bst_' + str(max_model_memory)) + return bst + + +get_model(20 * 1024) diff --git a/plbf/FastPLBF.py b/plbf/FastPLBF.py new file mode 100644 index 0000000..1bda5ec --- /dev/null +++ b/plbf/FastPLBF.py @@ -0,0 +1,119 @@ +from utils.ThresMaxDivDP import MaxDivDP, ThresMaxDiv +from utils.OptimalFPR import OptimalFPR +from utils.SpaceUsed import SpaceUsed +from utils.const import INF +from PLBF import PLBF + +import time +import argparse +import pandas as pd +from sklearn.model_selection import train_test_split + +class FastPLBF(PLBF): + def __init__(self, pos_keys: list, pos_scores: list[float], neg_scores: list[float], F: float, N: int, k: int): + """ + Args: + pos_keys (list): keys + pos_scores (list[float]): scores of keys + neg_scores (list[float]): scores of non-keys + F (float): target overall fpr + N (int): number of segments + k (int): number of regions + """ + + # assert + assert(isinstance(pos_keys, list)) + assert(isinstance(pos_scores, list)) + assert(len(pos_keys) == len(pos_scores)) + assert(isinstance(neg_scores, list)) + assert(isinstance(F, float)) + assert(0 < F < 1) + assert(isinstance(N, int)) + assert(isinstance(k, int)) + + for score in pos_scores: + assert(0 <= score <= 1) + for score in neg_scores: + assert(0 <= score <= 1) + + + self.F = F + self.N = N + self.k = k + self.n = len(pos_keys) + + + segment_thre_list, g, h = self.divide_into_segments(pos_scores, neg_scores) + self.find_best_t_and_f(segment_thre_list, g, h) + self.insert_keys(pos_keys, pos_scores) + + def find_best_t_and_f(self, segment_thre_list, g, h): + minSpaceUsed = INF + t_best = None + f_best = None + + DPKL, DPPre = MaxDivDP(g, h, self.N, self.k) + for j in range(self.k, self.N+1): + t = ThresMaxDiv(DPPre, j, self.k, segment_thre_list) + if t is None: + continue + f = OptimalFPR(g, h, t, 
self.F, self.k) + if minSpaceUsed > SpaceUsed(g, h, t, f, self.n): + minSpaceUsed = SpaceUsed(g, h, t, f, self.n) + t_best = t + f_best = f + + self.t = t_best + self.f = f_best + self.memory_usage_of_backup_bf = minSpaceUsed + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, + help="path of the dataset") + parser.add_argument('--N', action="store", dest="N", type=int, required=True, + help="N: the number of segments") + parser.add_argument('--k', action="store", dest="k", type=int, required=True, + help="k: the number of regions") + parser.add_argument('--F', action="store", dest="F", type=float, required=True, + help="F: the target overall fpr") + + results = parser.parse_args() + + DATA_PATH = results.data_path + N = results.N + k = results.k + F = results.F + + data = pd.read_csv(DATA_PATH) + negative_sample = data.loc[(data['label'] != 1)] + positive_sample = data.loc[(data['label'] == 1)] + train_negative, test_negative = train_test_split(negative_sample, test_size = 0.7, random_state = 0) + + pos_keys = list(positive_sample['key']) + pos_scores = list(positive_sample['score']) + train_neg_keys = list(train_negative['key']) + train_neg_scores = list(train_negative['score']) + test_neg_keys = list(test_negative['key']) + test_neg_scores = list(test_negative['score']) + + construct_start = time.time() + plbf = FastPLBF(pos_keys, pos_scores, train_neg_scores, F, N, k) + construct_end = time.time() + + # assert : no false negative + for key, score in zip(pos_keys, pos_scores): + assert(plbf.contains(key, score)) + + # test + fp_cnt = 0 + for key, score in zip(test_neg_keys, test_neg_scores): + if plbf.contains(key, score): + fp_cnt += 1 + + print(f"Construction Time: {construct_end - construct_start}") + print(f"Memory Usage of Backup BF: {plbf.memory_usage_of_backup_bf}") + print(f"False Positive Rate: {fp_cnt / len(test_neg_keys)} [{fp_cnt} / {len(test_neg_keys)}]") + + diff --git a/plbf/FastPLBF_M.py b/plbf/FastPLBF_M.py new file mode 100644 index 0000000..7735475 --- /dev/null +++ b/plbf/FastPLBF_M.py @@ -0,0 +1,153 @@ +import pandas as pd + +from .PLBF_M import PLBF_M +from .utils.ExpectedFPR import ExpectedFPR +from .utils.OptimalFPR_M import OptimalFPR_M +from .utils.SpaceUsed import SpaceUsed +from .utils.ThresMaxDivDP import MaxDivDP, ThresMaxDiv +from .utils.const import INF + + +class FastPLBF_M(PLBF_M): + def __init__(self, pos_keys: list, pos_scores: list[float], neg_scores: list[float], M: float, N: int, k: int): + """ + Args: + pos_keys (list): keys + pos_scores (list[float]): scores of keys + neg_scores (list[float]): scores of non-keys + M (float): the target memory usage for backup Bloom filters + N (int): number of segments + k (int): number of regions + """ + + # assert + assert (isinstance(pos_keys, list)) + assert (isinstance(pos_scores, list)) + assert (len(pos_keys) == len(pos_scores)) + assert (isinstance(neg_scores, list)) + assert (isinstance(M, float)) + assert (0 < M) + assert (isinstance(N, int)) + assert (isinstance(k, int)) + + for score in pos_scores: + assert (0 <= score <= 1) + for score in neg_scores: + assert (0 <= score <= 1) + + self.M = M + self.N = N + self.k = k + self.n = len(pos_keys) + self.fpr = 0.0 + + segment_thre_list, g, h = self.divide_into_segments(pos_scores, neg_scores) + self.find_best_t_and_f(segment_thre_list, g, h) + self.fpr = self.insert_keys(pos_keys, pos_scores, h) + + def find_best_t_and_f(self, segment_thre_list, 
g, h): + minExpectedFPR = INF + t_best = None + f_best = None + + DPKL, DPPre = MaxDivDP(g, h, self.N, self.k) + for j in range(self.k, self.N + 1): + t = ThresMaxDiv(DPPre, j, self.k, segment_thre_list) + if t is None: + continue + f = OptimalFPR_M(g, h, t, self.M, self.k, self.n) + if minExpectedFPR > ExpectedFPR(g, h, t, f, self.n): + minExpectedFPR = ExpectedFPR(g, h, t, f, self.n) + t_best = t + f_best = f + + # self.t = t_best + # self.f = f_best + # self.memory_usage_of_backup_bf = SpaceUsed(g, h, self.t, self.f, self.n) + if t_best is not None: + self.t = t_best + self.f = f_best + self.memory_usage_of_backup_bf = SpaceUsed(g, h, self.t, self.f, self.n) + else: + # 处理 t_best 为 None 的情况,例如抛出一个异常或设置默认值 + raise ValueError("No valid threshold (t) was found.") + + def get_fpr(self): + return self.fpr + + +def run(path, query_path, M, N, k): + data = pd.read_csv(path) + query_data = pd.read_csv(query_path) + negative_sample = data.loc[(data['label'] == 0)] + positive_sample = data.loc[(data['label'] == 1)] + # train_negative = negative_sample.sample(frac=0.8) + train_negative = negative_sample + query_negative = query_data.loc[(query_data['label'] == 0)] + + pos_keys = list(positive_sample['url']) + pos_scores = list(positive_sample['score']) + train_neg_scores = list(train_negative['score']) + + query_neg_keys = list(query_negative['url']) + query_neg_scores = list(query_negative['score']) + + plbf = FastPLBF_M(pos_keys, pos_scores, train_neg_scores, M, N, k) + + # test + fp_cnt = 0 + total = len(query_neg_keys) + for key, score in zip(query_neg_keys, query_neg_scores): + if plbf.contains(key, score): + fp_cnt += 1 + print(f"fpr: {float(fp_cnt) / total}") + print(f"Theoretical false positive rate: {plbf.get_fpr()}") + return float(fp_cnt) / total + +# if __name__ == "__main__": +# parser = argparse.ArgumentParser() +# parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, +# help="path of the dataset") +# parser.add_argument('--N', action="store", dest="N", type=int, required=True, +# help="N: the number of segments") +# parser.add_argument('--k', action="store", dest="k", type=int, required=True, +# help="k: the number of regions") +# parser.add_argument('--M', action="store", dest="M", type=float, required=True, +# help="M: the target memory usage for backup Bloom filters") +# +# results = parser.parse_args() +# +# DATA_PATH = results.data_path +# N = results.N +# k = results.k +# M = results.M +# +# data = pd.read_csv(DATA_PATH) +# negative_sample = data.loc[(data['label'] != 1)] +# positive_sample = data.loc[(data['label'] == 1)] +# train_negative, test_negative = train_test_split(negative_sample, test_size=0.7, random_state=0) +# +# pos_keys = list(positive_sample['key']) +# pos_scores = list(positive_sample['score']) +# train_neg_keys = list(train_negative['key']) +# train_neg_scores = list(train_negative['score']) +# test_neg_keys = list(test_negative['key']) +# test_neg_scores = list(test_negative['score']) +# +# construct_start = time.time() +# plbf = FastPLBF_M(pos_keys, pos_scores, train_neg_scores, M, N, k) +# construct_end = time.time() +# +# # assert : no false negative +# for key, score in zip(pos_keys, pos_scores): +# assert (plbf.contains(key, score)) +# +# # test +# fp_cnt = 0 +# for key, score in zip(test_neg_keys, test_neg_scores): +# if plbf.contains(key, score): +# fp_cnt += 1 +# +# print(f"Construction Time: {construct_end - construct_start}") +# print(f"Memory Usage of Backup BF: {plbf.memory_usage_of_backup_bf}") +# 
print(f"False Positive Rate: {fp_cnt / len(test_neg_keys)} [{fp_cnt} / {len(test_neg_keys)}]") diff --git a/plbf/FastPLBFpp.py b/plbf/FastPLBFpp.py new file mode 100644 index 0000000..36c6d0b --- /dev/null +++ b/plbf/FastPLBFpp.py @@ -0,0 +1,121 @@ +from utils.ThresMaxDivDP import fastMaxDivDP, ThresMaxDiv +from utils.OptimalFPR import OptimalFPR +from utils.SpaceUsed import SpaceUsed +from utils.const import INF +from PLBF import PLBF + +import time +import argparse +import pandas as pd +from sklearn.model_selection import train_test_split + +class FastPLBFpp(PLBF): + def __init__(self, pos_keys: list, pos_scores: list[float], neg_scores: list[float], F: float, N: int, k: int): + """ + Args: + pos_keys (list): keys + pos_scores (list[float]): scores of keys + neg_scores (list[float]): scores of non-keys + F (float): target overall fpr + N (int): number of segments + k (int): number of regions + """ + + # assert + assert(isinstance(pos_keys, list)) + assert(isinstance(pos_scores, list)) + assert(len(pos_keys) == len(pos_scores)) + assert(isinstance(neg_scores, list)) + assert(isinstance(F, float)) + assert(0 < F < 1) + assert(isinstance(N, int)) + assert(isinstance(k, int)) + + for score in pos_scores: + assert(0 <= score <= 1) + for score in neg_scores: + assert(0 <= score <= 1) + + + self.F = F + self.N = N + self.k = k + self.n = len(pos_keys) + + + segment_thre_list, g, h = self.divide_into_segments(pos_scores, neg_scores) + self.find_best_t_and_f(segment_thre_list, g, h) + self.insert_keys(pos_keys, pos_scores) + + def find_best_t_and_f(self, segment_thre_list, g, h): + minSpaceUsed = INF + t_best = None + f_best = None + + DPKL, DPPre = fastMaxDivDP(g, h, self.N, self.k) + for j in range(self.k, self.N+1): + t = ThresMaxDiv(DPPre, j, self.k, segment_thre_list) + if t is None: + continue + f = OptimalFPR(g, h, t, self.F, self.k) + if minSpaceUsed > SpaceUsed(g, h, t, f, self.n): + minSpaceUsed = SpaceUsed(g, h, t, f, self.n) + t_best = t + f_best = f + + self.t = t_best + self.f = f_best + self.memory_usage_of_backup_bf = minSpaceUsed + + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, + help="path of the dataset") + parser.add_argument('--N', action="store", dest="N", type=int, required=True, + help="N: the number of segments") + parser.add_argument('--k', action="store", dest="k", type=int, required=True, + help="k: the number of regions") + parser.add_argument('--F', action="store", dest="F", type=float, required=True, + help="F: the target overall fpr") + + results = parser.parse_args() + + DATA_PATH = results.data_path + N = results.N + k = results.k + F = results.F + + data = pd.read_csv(DATA_PATH) + negative_sample = data.loc[(data['label'] != 1)] + positive_sample = data.loc[(data['label'] == 1)] + train_negative, test_negative = train_test_split(negative_sample, test_size = 0.7, random_state = 0) + + pos_keys = list(positive_sample['key']) + pos_scores = list(positive_sample['score']) + train_neg_keys = list(train_negative['key']) + train_neg_scores = list(train_negative['score']) + test_neg_keys = list(test_negative['key']) + test_neg_scores = list(test_negative['score']) + + construct_start = time.time() + plbf = FastPLBFpp(pos_keys, pos_scores, train_neg_scores, F, N, k) + construct_end = time.time() + + # assert : no false negative + for key, score in zip(pos_keys, pos_scores): + assert(plbf.contains(key, score)) + + # test + fp_cnt = 0 + for key, score in 
zip(test_neg_keys, test_neg_scores): + if plbf.contains(key, score): + fp_cnt += 1 + + print(f"Construction Time: {construct_end - construct_start}") + print(f"Memory Usage of Backup BF: {plbf.memory_usage_of_backup_bf}") + print(f"False Positive Rate: {fp_cnt / len(test_neg_keys)} [{fp_cnt} / {len(test_neg_keys)}]") + + diff --git a/plbf/FastPLBFpp_M.py b/plbf/FastPLBFpp_M.py new file mode 100644 index 0000000..dd65569 --- /dev/null +++ b/plbf/FastPLBFpp_M.py @@ -0,0 +1,122 @@ +from utils.ThresMaxDivDP import fastMaxDivDP, ThresMaxDiv +from utils.OptimalFPR_M import OptimalFPR_M +from utils.SpaceUsed import SpaceUsed +from utils.ExpectedFPR import ExpectedFPR +from utils.const import INF +from PLBF_M import PLBF_M + +import time +import argparse +import pandas as pd +from sklearn.model_selection import train_test_split + +class FastPLBFpp_M(PLBF_M): + def __init__(self, pos_keys: list, pos_scores: list[float], neg_scores: list[float], M: float, N: int, k: int): + """ + Args: + pos_keys (list): keys + pos_scores (list[float]): scores of keys + neg_scores (list[float]): scores of non-keys + M (float): the target memory usage for backup Bloom filters + N (int): number of segments + k (int): number of regions + """ + + # assert + assert(isinstance(pos_keys, list)) + assert(isinstance(pos_scores, list)) + assert(len(pos_keys) == len(pos_scores)) + assert(isinstance(neg_scores, list)) + assert(isinstance(M, float)) + assert(0 < M) + assert(isinstance(N, int)) + assert(isinstance(k, int)) + + for score in pos_scores: + assert(0 <= score <= 1) + for score in neg_scores: + assert(0 <= score <= 1) + + + self.M = M + self.N = N + self.k = k + self.n = len(pos_keys) + + + segment_thre_list, g, h = self.divide_into_segments(pos_scores, neg_scores) + self.find_best_t_and_f(segment_thre_list, g, h) + self.insert_keys(pos_keys, pos_scores) + + def find_best_t_and_f(self, segment_thre_list, g, h): + minExpectedFPR = INF + t_best = None + f_best = None + + DPKL, DPPre = fastMaxDivDP(g, h, self.N, self.k) + for j in range(self.k, self.N+1): + t = ThresMaxDiv(DPPre, j, self.k, segment_thre_list) + if t is None: + continue + f = OptimalFPR_M(g, h, t, self.M, self.k, self.n) + if minExpectedFPR > ExpectedFPR(g, h, t, f, self.n): + minExpectedFPR = ExpectedFPR(g, h, t, f, self.n) + t_best = t + f_best = f + + self.t = t_best + self.f = f_best + self.memory_usage_of_backup_bf = SpaceUsed(g, h, t, f, self.n) + + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, + help="path of the dataset") + parser.add_argument('--N', action="store", dest="N", type=int, required=True, + help="N: the number of segments") + parser.add_argument('--k', action="store", dest="k", type=int, required=True, + help="k: the number of regions") + parser.add_argument('--M', action="store", dest="M", type=float, required=True, + help="M: the target memory usage for backup Bloom filters") + + results = parser.parse_args() + + DATA_PATH = results.data_path + N = results.N + k = results.k + M = results.M + + data = pd.read_csv(DATA_PATH) + negative_sample = data.loc[(data['label'] != 1)] + positive_sample = data.loc[(data['label'] == 1)] + train_negative, test_negative = train_test_split(negative_sample, test_size = 0.7, random_state = 0) + + pos_keys = list(positive_sample['key']) + pos_scores = list(positive_sample['score']) + train_neg_keys = list(train_negative['key']) + train_neg_scores = list(train_negative['score']) + 
test_neg_keys = list(test_negative['key']) + test_neg_scores = list(test_negative['score']) + + construct_start = time.time() + plbf = FastPLBFpp_M(pos_keys, pos_scores, train_neg_scores, M, N, k) + construct_end = time.time() + + # assert : no false negative + for key, score in zip(pos_keys, pos_scores): + assert(plbf.contains(key, score)) + + # test + fp_cnt = 0 + for key, score in zip(test_neg_keys, test_neg_scores): + if plbf.contains(key, score): + fp_cnt += 1 + + print(f"Construction Time: {construct_end - construct_start}") + print(f"Memory Usage of Backup BF: {plbf.memory_usage_of_backup_bf}") + print(f"False Positive Rate: {fp_cnt / len(test_neg_keys)} [{fp_cnt} / {len(test_neg_keys)}]") + + diff --git a/plbf/PLBF.py b/plbf/PLBF.py new file mode 100644 index 0000000..fe022ea --- /dev/null +++ b/plbf/PLBF.py @@ -0,0 +1,161 @@ +from utils.ThresMaxDivDP import ThresMaxDivDP +from utils.OptimalFPR import OptimalFPR +from utils.SpaceUsed import SpaceUsed +from utils.prList import prList +from utils.const import INF, EPS + +import time +import bisect +from bloom_filter import BloomFilter +import pandas as pd +from sklearn.model_selection import train_test_split + +import argparse + +class PLBF: + def __init__(self, pos_keys: list, pos_scores: list[float], neg_scores: list[float], F: float, N: int, k: int): + """ + Args: + pos_keys (list): keys + pos_scores (list[float]): scores of keys + neg_scores (list[float]): scores of non-keys + F (float): target overall fpr + N (int): number of segments + k (int): number of regions + """ + + # assert + assert(isinstance(pos_keys, list)) + assert(isinstance(pos_scores, list)) + assert(len(pos_keys) == len(pos_scores)) + assert(isinstance(neg_scores, list)) + assert(isinstance(F, float)) + assert(0 < F < 1) + assert(isinstance(N, int)) + assert(isinstance(k, int)) + + for score in pos_scores: + assert(0 <= score <= 1) + for score in neg_scores: + assert(0 <= score <= 1) + + + self.F = F + self.N = N + self.k = k + self.n = len(pos_keys) + + + segment_thre_list, g, h = self.divide_into_segments(pos_scores, neg_scores) + self.find_best_t_and_f(segment_thre_list, g, h) + self.insert_keys(pos_keys, pos_scores) + + + def divide_into_segments(self, pos_scores: list[float], neg_scores: list[float]): + segment_thre_list = [i / self.N for i in range(self.N + 1)] + g = prList(pos_scores, segment_thre_list) + h = prList(neg_scores, segment_thre_list) + return segment_thre_list, g, h + + def find_best_t_and_f(self, segment_thre_list, g, h): + minSpaceUsed = INF + t_best = None + f_best = None + + for j in range(self.k, self.N+1): + t = ThresMaxDivDP(g, h, j, self.k) + if t is None: + continue + f = OptimalFPR(g, h, t, self.F, self.k) + if minSpaceUsed > SpaceUsed(g, h, t, f, self.n): + minSpaceUsed = SpaceUsed(g, h, t, f, self.n) + t_best = t + f_best = f + + self.t = t_best + self.f = f_best + self.memory_usage_of_backup_bf = minSpaceUsed + + def insert_keys(self, pos_keys: list, pos_scores: list[float]): + pos_cnt_list = [0 for _ in range(self.k + 1)] + for score in pos_scores: + region_idx = self.get_region_idx(score) + pos_cnt_list[region_idx] += 1 + + + self.backup_bloom_filters = [None for _ in range(self.k + 1)] + for i in range(1, self.k + 1): + if 0 < self.f[i] < 1: + self.backup_bloom_filters[i] = BloomFilter(max_elements = pos_cnt_list[i], error_rate = self.f[i]) + elif self.f[i] == 0: + assert(pos_cnt_list[i] == 0) + self.backup_bloom_filters[i] = BloomFilter(max_elements = 1, error_rate = 1 - EPS) + + for key, score in zip(pos_keys, pos_scores): + 
region_idx = self.get_region_idx(score) + if self.backup_bloom_filters[region_idx] is not None: + self.backup_bloom_filters[region_idx].add(key) + + def get_region_idx(self, score): + region_idx = bisect.bisect_left(self.t, score) + if region_idx == 0: + region_idx = 1 + return region_idx + + def contains(self, key, score): + assert(0 <= score <= 1) + region_idx = self.get_region_idx(score) + if self.backup_bloom_filters[region_idx] is None: + return True + + return (key in self.backup_bloom_filters[region_idx]) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, + help="path of the dataset") + parser.add_argument('--N', action="store", dest="N", type=int, required=True, + help="N: the number of segments") + parser.add_argument('--k', action="store", dest="k", type=int, required=True, + help="k: the number of regions") + parser.add_argument('--F', action="store", dest="F", type=float, required=True, + help="F: the target overall fpr") + + results = parser.parse_args() + + DATA_PATH = results.data_path + N = results.N + k = results.k + F = results.F + + data = pd.read_csv(DATA_PATH) + negative_sample = data.loc[(data['label'] != 1)] + positive_sample = data.loc[(data['label'] == 1)] + train_negative, test_negative = train_test_split(negative_sample, test_size = 0.7, random_state = 0) + + pos_keys = list(positive_sample['key']) + pos_scores = list(positive_sample['score']) + train_neg_keys = list(train_negative['key']) + train_neg_scores = list(train_negative['score']) + test_neg_keys = list(test_negative['key']) + test_neg_scores = list(test_negative['score']) + + construct_start = time.time() + plbf = PLBF(pos_keys, pos_scores, train_neg_scores, F, N, k) + construct_end = time.time() + + # assert : no false negative + for key, score in zip(pos_keys, pos_scores): + assert(plbf.contains(key, score)) + + # test + fp_cnt = 0 + for key, score in zip(test_neg_keys, test_neg_scores): + if plbf.contains(key, score): + fp_cnt += 1 + + print(f"Construction Time: {construct_end - construct_start}") + print(f"Memory Usage of Backup BF: {plbf.memory_usage_of_backup_bf}") + print(f"False Positive Rate: {fp_cnt / len(test_neg_keys)} [{fp_cnt} / {len(test_neg_keys)}]") diff --git a/plbf/PLBF_M.py b/plbf/PLBF_M.py new file mode 100644 index 0000000..a3a7c71 --- /dev/null +++ b/plbf/PLBF_M.py @@ -0,0 +1,163 @@ +from .utils.ThresMaxDivDP import ThresMaxDivDP +from .utils.OptimalFPR_M import OptimalFPR_M +from .utils.SpaceUsed import SpaceUsed +from .utils.ExpectedFPR import ExpectedFPR +from .utils.prList import prList +from .utils.const import INF, EPS + +import time +import bisect +from bloom_filter import BloomFilter +import pandas as pd +from sklearn.model_selection import train_test_split + +import argparse + + +class PLBF_M: + def __init__(self, pos_keys: list, pos_scores: list[float], neg_scores: list[float], M: float, N: int, k: int): + """ + Args: + pos_keys (list): keys + pos_scores (list[float]): scores of keys + neg_scores (list[float]): scores of non-keys + M (float): the target memory usage for backup Bloom filters + N (int): number of segments + k (int): number of regions + """ + + # assert + assert (isinstance(pos_keys, list)) + assert (isinstance(pos_scores, list)) + assert (len(pos_keys) == len(pos_scores)) + assert (isinstance(neg_scores, list)) + assert (isinstance(M, float)) + assert (0 < M) + assert (isinstance(N, int)) + assert (isinstance(k, int)) + + for score in 
pos_scores: + assert (0 <= score <= 1) + for score in neg_scores: + assert (0 <= score <= 1) + + self.M = M + self.N = N + self.k = k + self.n = len(pos_keys) + self.fpr = 0.0 + + segment_thre_list, g, h = self.divide_into_segments(pos_scores, neg_scores) + self.find_best_t_and_f(segment_thre_list, g, h) + self.insert_keys(pos_keys, pos_scores, h) + + def divide_into_segments(self, pos_scores: list[float], neg_scores: list[float]): + segment_thre_list = [i / self.N for i in range(self.N + 1)] + g = prList(pos_scores, segment_thre_list) + h = prList(neg_scores, segment_thre_list) + return segment_thre_list, g, h + + def find_best_t_and_f(self, segment_thre_list, g, h): + minExpectedFPR = INF + t_best = None + f_best = None + + for j in range(self.k, self.N + 1): + t = ThresMaxDivDP(g, h, j, self.k) + if t is None: + continue + f = OptimalFPR_M(g, h, t, self.M, self.k, self.n) + if minExpectedFPR > ExpectedFPR(g, h, t, f, self.n): + minExpectedFPR = ExpectedFPR(g, h, t, f, self.n) + t_best = t + f_best = f + + self.t = t_best + self.f = f_best + self.memory_usage_of_backup_bf = SpaceUsed(g, h, t, f, self.n) + + def insert_keys(self, pos_keys: list, pos_scores: list[float], h): + pos_cnt_list = [0 for _ in range(self.k + 1)] + for score in pos_scores: + region_idx = self.get_region_idx(score) + pos_cnt_list[region_idx] += 1 + neg_pr_list = [h.acc_range(self.t[i - 1], self.t[i]) for i in range(1, self.k + 1)] + + self.backup_bloom_filters = [None for _ in range(self.k + 1)] + for i in range(1, self.k + 1): + if 0 < self.f[i] < 1: + self.backup_bloom_filters[i] = BloomFilter(max_elements=pos_cnt_list[i], error_rate=self.f[i]) + self.fpr += self.f[i] * neg_pr_list[i - 1] + elif self.f[i] == 0: + assert (pos_cnt_list[i] == 0) + self.backup_bloom_filters[i] = BloomFilter(max_elements=1, error_rate=1 - EPS) + self.fpr += (1 - EPS) * neg_pr_list[i - 1] + + for key, score in zip(pos_keys, pos_scores): + region_idx = self.get_region_idx(score) + if self.backup_bloom_filters[region_idx] is not None: + self.backup_bloom_filters[region_idx].add(key) + return self.fpr + + def get_region_idx(self, score): + region_idx = bisect.bisect_left(self.t, score) + if region_idx == 0: + region_idx = 1 + return region_idx + + def contains(self, key, score): + assert (0 <= score <= 1) + region_idx = self.get_region_idx(score) + if self.backup_bloom_filters[region_idx] is None: + return True + + return (key in self.backup_bloom_filters[region_idx]) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, + help="path of the dataset") + parser.add_argument('--N', action="store", dest="N", type=int, required=True, + help="N: the number of segments") + parser.add_argument('--k', action="store", dest="k", type=int, required=True, + help="k: the number of regions") + parser.add_argument('--M', action="store", dest="M", type=float, required=True, + help="M: the target memory usage for backup Bloom filters") + + results = parser.parse_args() + + DATA_PATH = results.data_path + N = results.N + k = results.k + M = results.M + + data = pd.read_csv(DATA_PATH) + negative_sample = data.loc[(data['label'] != 1)] + positive_sample = data.loc[(data['label'] == 1)] + train_negative, test_negative = train_test_split(negative_sample, test_size=0.7, random_state=0) + + pos_keys = list(positive_sample['key']) + pos_scores = list(positive_sample['score']) + train_neg_keys = list(train_negative['key']) + train_neg_scores = 
list(train_negative['score']) + test_neg_keys = list(test_negative['key']) + test_neg_scores = list(test_negative['score']) + + construct_start = time.time() + plbf = PLBF_M(pos_keys, pos_scores, train_neg_scores, M, N, k) + construct_end = time.time() + + # assert : no false negative + for key, score in zip(pos_keys, pos_scores): + assert (plbf.contains(key, score)) + + # test + fp_cnt = 0 + for key, score in zip(test_neg_keys, test_neg_scores): + if plbf.contains(key, score): + fp_cnt += 1 + + print(f"Construction Time: {construct_end - construct_start}") + print(f"Memory Usage of Backup BF: {plbf.memory_usage_of_backup_bf}") + print(f"False Positive Rate: {fp_cnt / len(test_neg_keys)} [{fp_cnt} / {len(test_neg_keys)}]") diff --git a/plbf/__init__.py b/plbf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plbf/__pycache__/FastPLBF_M.cpython-311.pyc b/plbf/__pycache__/FastPLBF_M.cpython-311.pyc new file mode 100644 index 0000000..bb79878 Binary files /dev/null and b/plbf/__pycache__/FastPLBF_M.cpython-311.pyc differ diff --git a/plbf/__pycache__/PLBF_M.cpython-311.pyc b/plbf/__pycache__/PLBF_M.cpython-311.pyc new file mode 100644 index 0000000..95589c8 Binary files /dev/null and b/plbf/__pycache__/PLBF_M.cpython-311.pyc differ diff --git a/plbf/__pycache__/__init__.cpython-311.pyc b/plbf/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..8fc7e53 Binary files /dev/null and b/plbf/__pycache__/__init__.cpython-311.pyc differ diff --git a/plbf/main.py b/plbf/main.py new file mode 100644 index 0000000..6087cce --- /dev/null +++ b/plbf/main.py @@ -0,0 +1,68 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd + +import lib.lgb_url +from plbf import FastPLBF_M + +df_train = pd.read_csv('../dataset/url_train.csv') +df_test = pd.read_csv('../dataset/url_test.csv') +df_query = pd.read_csv('../dataset/url_query.csv') + +train_urls = df_train['url'] +test_urls = df_test['url'] +query_urls = df_query['url'] + +X_train = df_train.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_train = df_train['url_type'].values.astype(np.float32) +X_test = df_test.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_test = df_test['url_type'].values.astype(np.float32) +X_query = df_query.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_query = df_query['url_type'].values.astype(np.float32) + +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) +query_data = lgb.Dataset(X_query, label=y_query, free_raw_data=False) +n_true = df_train[df_train['url_type'] == 1].shape[0] + df_test[df_test['url_type'] == 1].shape[0] +n_test = len(df_test) + +bst = lgb.Booster(model_file='../best_bst_20480') + +y_pred_train = bst.predict(X_train) +y_pred_test = bst.predict(X_test) +y_pred_query = bst.predict(X_query) + +train_results = pd.DataFrame({ + 'url': train_urls, + 'label': y_train, + 'score': y_pred_train +}) + +test_results = pd.DataFrame({ + 'url': test_urls, + 'label': y_test, + 'score': y_pred_test +}) +all_results = pd.concat([train_results, test_results]) +all_results.to_csv('url_results.csv', index=False) +query_results = pd.DataFrame({ + 'url': query_urls, + 'label': y_query, + 'score': y_pred_query +}) +query_results.to_csv('query_results.csv', index=False) +# 初始化变量 +model_size = lib.lgb_url.lgb_get_model_size(bst) +print("模型在内存中所占用的大小(字节):", model_size) + +for size in range(64 * 1024, 320 * 1024 + 1, 64 * 1024): + bloom_size = size - 
model_size + print(bloom_size) + bloom_size = bloom_size * 8.0 + FastPLBF_M.run( + path='url_results.csv', + query_path='query_results.csv', + M=bloom_size, + N=50, + k=5, + ) diff --git a/plbf/tweet_main.py b/plbf/tweet_main.py new file mode 100644 index 0000000..630d464 --- /dev/null +++ b/plbf/tweet_main.py @@ -0,0 +1,108 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd +import lib.network +import lib.data_processing +import lib.lgb_url +from plbf import FastPLBF_M + +data_train = pd.read_csv('../dataset/tweet/tweet_train.csv') +data_test = pd.read_csv('../dataset/tweet/tweet_test.csv') +data_query = pd.read_csv('../dataset/tweet/tweet_query.csv') + +word_dict, region_dict = lib.data_processing.loading_embedding("tweet") + + +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + true_num = data_train[data_train['is_in'] == 1].shape[0] + false_num = data_train[data_train['is_in'] == 0].shape[0] + + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + positive_insert = insert[data_train['is_in'] == 1] + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) + + # 生成一个用于神经网络输入的dataframe:embedding + embedding = pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1) + # print(embedding) + y = data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + # print(X) + return X, y, insert, true_num, false_num, positive_insert + + +X_train, y_train, train_insert, train_true, train_false, train_positive_insert = yelp_embedding(data_train, + word_dict=word_dict, + region_dict=region_dict) +X_test, y_test, test_insert, test_true, test_false, test_positive_insert = yelp_embedding(data_test, + word_dict=word_dict, + region_dict=region_dict) +X_query, y_query, query_insert, query_true, query_false, query_positive_insert = yelp_embedding(data_query, + word_dict=word_dict, + region_dict=region_dict) +print(train_positive_insert) +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) + +n_true = train_true + test_true +n_false = test_false + train_false + +n_test = test_true + test_false + +bst = lgb.Booster(model_file='../best_bst_20480') + +y_pred_train = bst.predict(X_train) +y_pred_test = bst.predict(X_test) +y_pred_query = bst.predict(X_query) + +train_results = pd.DataFrame({ + 'url': train_insert, + 'label': y_train, + 'score': y_pred_train +}) + +test_results = pd.DataFrame({ + 'url': test_insert, + 'label': y_test, + 'score': y_pred_test +}) +all_results = pd.concat([train_results, test_results]) +all_results.to_csv('url_results.csv', index=False) +query_results = pd.DataFrame({ + 'url': query_insert, + 'label': y_query, + 'score': y_pred_query +}) +query_results.to_csv('query_results.csv', index=False) +# 初始化变量 +model_size = 
lib.lgb_url.lgb_get_model_size(bst) +print("模型在内存中所占用的大小(字节):", model_size) + +for size in range(64 * 1024, 320 * 1024 + 1, 64 * 1024): + bloom_size = size - model_size + print(bloom_size) + bloom_size = bloom_size * 8.0 + FastPLBF_M.run( + path='url_results.csv', + query_path='query_results.csv', + M=bloom_size, + N=50, + k=5, + ) diff --git a/plbf/utils/ExpectedFPR.py b/plbf/utils/ExpectedFPR.py new file mode 100644 index 0000000..34fd09e --- /dev/null +++ b/plbf/utils/ExpectedFPR.py @@ -0,0 +1,27 @@ +import math +from .prList import prList + +def ExpectedFPR(g: prList, h: prList, t: list[float], f: list[float], n: int) -> float: + """ + + Args: + g (prList): key density of each segmenet + h (prList): non-key density of each segmenet + t (list[float]): threshold boundaries of each region + f (list[float]): FPRs of each region + n (int): the number of keys + Returns: + float: expectedFPR + """ + + N = g.N + k = len(t) - 1 + + expectedFPR = 0 + for i in range(1, k+1): + neg_pr = h.acc_range(t[i-1], t[i]) + expectedFPR += neg_pr * f[i] + + return expectedFPR + + diff --git a/plbf/utils/OptimalFPR.py b/plbf/utils/OptimalFPR.py new file mode 100644 index 0000000..8f00f34 --- /dev/null +++ b/plbf/utils/OptimalFPR.py @@ -0,0 +1,87 @@ +from utils.prList import prList +from utils.const import EPS + +def OptimalFPR(g: prList, h: prList, t: list[float], F: float, k: int) -> list[float]: + """_summary_ + + Args: + g (prList): key density of each segmenet + h (prList): non-keye density of each segmenet + t (list[float]): threshold boundaries of each region + F (float): target overall fpr + k (int): number of regions + + Returns: + list[float]: FPRs of each region (1-index) + """ + + + N = g.N + + pos_pr_list = [g.acc_range(t[i-1], t[i]) for i in range(1, k+1)] + neg_pr_list = [h.acc_range(t[i-1], t[i]) for i in range(1, k+1)] + + assert(abs(sum(pos_pr_list) - 1) < EPS) + assert(abs(sum(neg_pr_list) - 1) < EPS) + + valid_list = [True for i in range(k)] + + for i in range(k): + if neg_pr_list[i] == 0: + valid_list[i] = False + + while True: + valid_pos_pr_sum = 0 + valid_neg_pr_sum = 0 + invalid_pos_pr_sum = 0 + invalid_neg_pr_sum = 0 + for val, pos_pr, neg_pr in zip(valid_list, pos_pr_list, neg_pr_list): + if val: + valid_pos_pr_sum += pos_pr + valid_neg_pr_sum += neg_pr + else: + invalid_pos_pr_sum += pos_pr + invalid_neg_pr_sum += neg_pr + normed_F = (F - invalid_neg_pr_sum) / (1 - invalid_neg_pr_sum) + + if valid_pos_pr_sum == 0: + # The F is too large that the Bloom filter does not need to be used. 
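+            # (Every remaining valid region carries no key mass, so it can be
+            # assigned an FPR of 0 and needs no backup Bloom filter; regions
+            # already marked invalid are assigned an FPR of 1 and pass queries
+            # straight through.)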
+ opt_fpr_list = [] + for i in range(k): + if valid_list[i]: + opt_fpr_list.append(0.0) + else: + opt_fpr_list.append(1.0) + # f to 1-index + opt_fpr_list.insert(0, None) + + return opt_fpr_list + + normed_pos_pr_list = [0 for i in range(k)] + normed_neg_pr_list = [0 for i in range(k)] + for idx, (pos_pr, neg_pr) in enumerate(zip(pos_pr_list, neg_pr_list)): + if valid_list[idx]: + normed_pos_pr_list[idx] = pos_pr / valid_pos_pr_sum + normed_neg_pr_list[idx] = neg_pr / valid_neg_pr_sum + + opt_fpr_list = [0 for i in range(k)] + for idx, (n_pos_pr, n_neg_pr) in enumerate(zip(normed_pos_pr_list, normed_neg_pr_list)): + if not valid_list[idx]: + opt_fpr_list[idx] = 1 + else: + opt_fpr_list[idx] = normed_F * n_pos_pr / n_neg_pr + + ok = True + for idx, opt_fpr in enumerate(opt_fpr_list): + if opt_fpr > 1: + ok = False + valid_list[idx] = False + if ok: + break + + # f to 1-index + opt_fpr_list.insert(0, None) + + assert(len(opt_fpr_list) == k+1) + return opt_fpr_list + diff --git a/plbf/utils/OptimalFPR_M.py b/plbf/utils/OptimalFPR_M.py new file mode 100644 index 0000000..efce8c9 --- /dev/null +++ b/plbf/utils/OptimalFPR_M.py @@ -0,0 +1,93 @@ +from .prList import prList +from .const import EPS +import math + +def calc_K_sum(pos_pr_list: list[float], neg_pr_list: list[float], valid_list: list[bool]) -> float: + K_sum = 0 + for pos_pr, neg_pr, valid in zip(pos_pr_list, neg_pr_list, valid_list): + if not valid: + continue + if pos_pr == 0: + continue + K_sum += pos_pr * math.log2(pos_pr / neg_pr) + return K_sum + +def calc_G_sum(pos_pr_list: list[float], neg_pr_list: list[float], valid_list: list[bool]) -> float: + G_sum = 0 + for pos_pr, neg_pr, valid in zip(pos_pr_list, neg_pr_list, valid_list): + if valid: + continue + G_sum += pos_pr + return G_sum + +def some_f_i_is_greater_than_1(f: list[float]) -> bool: + for f_i in f: + if f_i > 1: + return True + return False + +def OptimalFPR_M(g: prList, h: prList, t: list[float], M: float, k: int, n: int) -> list[float]: + """_summary_ + + Args: + g (prList): key density of each segmenet + h (prList): non-keye density of each segmenet + t (list[float]): threshold boundaries of each region + M (float): the target memory usage for backup Bloom filters + k (int): number of regions + n (int): number of keys + + Returns: + list[float]: FPRs of each region (1-index) + """ + + c = math.log2(math.e) + + pos_pr_list = [g.acc_range(t[i-1], t[i]) for i in range(1, k+1)] + neg_pr_list = [h.acc_range(t[i-1], t[i]) for i in range(1, k+1)] + + assert(abs(sum(pos_pr_list) - 1) < EPS) + assert(abs(sum(neg_pr_list) - 1) < EPS) + + valid_list = [True for i in range(k)] + + for i in range(k): + if neg_pr_list[i] == 0: + valid_list[i] = False + + G_sum = calc_G_sum(pos_pr_list, neg_pr_list, valid_list) + K_sum = calc_K_sum(pos_pr_list, neg_pr_list, valid_list) + + beta = (M + c * n * K_sum) / (c * n * (1 - G_sum)) + + opt_fpr_list = [0 for i in range(k)] + for i in range(k): + if not valid_list[i]: + opt_fpr_list[i] = 1 + else: + opt_fpr_list[i] = math.pow(2, -beta) * pos_pr_list[i] / neg_pr_list[i] + + while some_f_i_is_greater_than_1(opt_fpr_list): + for i in range(k): + if opt_fpr_list[i] > 1: + valid_list[i] = False + opt_fpr_list[i] = 1 + + G_sum = calc_G_sum(pos_pr_list, neg_pr_list, valid_list) + K_sum = calc_K_sum(pos_pr_list, neg_pr_list, valid_list) + + beta = (M + c * n * K_sum) / (c * n * (1 - G_sum)) + + for i in range(k): + if not valid_list[i]: + opt_fpr_list[i] = 1 + else: + opt_fpr_list[i] = math.pow(2, -beta) * pos_pr_list[i] / neg_pr_list[i] + + + # f to 
1-index + opt_fpr_list.insert(0, None) + + assert(len(opt_fpr_list) == k+1) + return opt_fpr_list + diff --git a/plbf/utils/SpaceUsed.py b/plbf/utils/SpaceUsed.py new file mode 100644 index 0000000..db30cea --- /dev/null +++ b/plbf/utils/SpaceUsed.py @@ -0,0 +1,33 @@ +import math +from .prList import prList + +def SpaceUsed(g: prList, h: prList, t: list[float], f: list[float], n: int) -> float: + """ + + Args: + g (prList): key density of each segmenet + h (prList): non-keye density of each segmenet + t (list[float]): threshold boundaries of each region + f (list[float]): FPRs of each region + n (int): the number of keys + Returns: + float: spaceUsed + """ + + N = g.N + k = len(t) - 1 + + spaceUsed = 0 + for i in range(1, k+1): + pos_pr = g.acc_range(t[i-1], t[i]) + pos_num = pos_pr * n + if pos_num == 0: + continue + fpr = f[i] + hash_num = math.log(fpr) / math.log(0.5) + m = hash_num * pos_num / math.log(2) + spaceUsed += m + + return spaceUsed + + diff --git a/plbf/utils/ThresMaxDivDP.py b/plbf/utils/ThresMaxDivDP.py new file mode 100644 index 0000000..f44df6e --- /dev/null +++ b/plbf/utils/ThresMaxDivDP.py @@ -0,0 +1,124 @@ +from typing import Tuple +from .prList import prList +from .calc_DPKL import calc_DPKL +from .fast_calc_DPKL import fast_calc_DPKL +from .const import INF + + +def ThresMaxDivDP(g: prList, h: prList, j: int, k: int) -> list[float]: + """ + + Args: + g (prList): key density of each segmenet + h (prList): non-keye density of each segmenet + j (int): j-th to N-th segments are clustered as k-th region + k (int): number of regions + + Returns: + list[float]: t (threshold boundaries of each region) + """ + + assert(isinstance(g, prList)) + assert(isinstance(h, prList)) + assert(isinstance(j, int)) + assert(isinstance(k, int)) + N = g.N + assert(h.N == N) + + + DPKL, DPPre = calc_DPKL(g, h, k, j) + + # tracing the transitions backward from DPPre[j-1][k-1] + if DPPre[j-1][k-1] is None: + return None + + reversed_t = [1.0] + + now = j-1 + reversed_t.append(g.thre_list[now]) + for i in reversed(range(1, k)): + now = DPPre[now][i] + reversed_t.append(g.thre_list[now]) + + t = list(reversed(reversed_t)) + + assert(len(t) == k+1) + return t + + + + + + +def MaxDivDP(g: prList, h: prList, N: int, k: int) -> Tuple[list[list[float]], list[list[int]]]: + """ + + Args: + g (prList): key density of each segmenet + h (prList): non-keye density of each segmenet + N (int): number of segments + k (int): number of regions + + Returns: + Tuple[list[list[float]], list[list[int]]]: DPKL, DPPre + """ + + assert(isinstance(g, prList)) + assert(isinstance(h, prList)) + assert(isinstance(N, int)) + assert(isinstance(k, int)) + N = g.N + assert(h.N == N) + + DPKL, DPPre = calc_DPKL(g, h, k) + return DPKL, DPPre + +def ThresMaxDiv(DPPre: list[list[int]], j: int, k: int, thre_list: list[float]): + """ + + Args: + DPPre (list[list[int]]): DPPre + j (int): j-th to N-th segments are clustered as k-th region + k (int): number of regions + """ + + assert(isinstance(DPPre, list)) + assert(isinstance(j, int)) + assert(isinstance(k, int)) + + # tracing the transitions backward from DPPre[j-1][k-1] + if DPPre[j-1][k-1] is None: + return None + + reversed_t = [1.0] + + now = j-1 + reversed_t.append(thre_list[now]) + for i in reversed(range(1, k)): + now = DPPre[now][i] + if now is None: + return None + reversed_t.append(thre_list[now]) + + t = list(reversed(reversed_t)) + + assert(len(t) == k+1) + return t + +def fastMaxDivDP(g: prList, h: prList, N: int, k: int) -> Tuple[list[list[float]], list[list[int]]]: 
+ """ + + Args: + g (prList): key density of each segmenet + h (prList): non-keye density of each segmenet + N (int): number of segments + k (int): number of regions + + Returns: + Tuple[list[list[float]], list[list[int]]]: DPKL, DPPre + """ + + N = g.N + + DPKL, DPPre = fast_calc_DPKL(g, h, k) + return DPKL, DPPre diff --git a/plbf/utils/__init__.py b/plbf/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plbf/utils/__pycache__/ExpectedFPR.cpython-311.pyc b/plbf/utils/__pycache__/ExpectedFPR.cpython-311.pyc new file mode 100644 index 0000000..b9a6183 Binary files /dev/null and b/plbf/utils/__pycache__/ExpectedFPR.cpython-311.pyc differ diff --git a/plbf/utils/__pycache__/OptimalFPR_M.cpython-311.pyc b/plbf/utils/__pycache__/OptimalFPR_M.cpython-311.pyc new file mode 100644 index 0000000..cad830e Binary files /dev/null and b/plbf/utils/__pycache__/OptimalFPR_M.cpython-311.pyc differ diff --git a/plbf/utils/__pycache__/SpaceUsed.cpython-311.pyc b/plbf/utils/__pycache__/SpaceUsed.cpython-311.pyc new file mode 100644 index 0000000..952a4eb Binary files /dev/null and b/plbf/utils/__pycache__/SpaceUsed.cpython-311.pyc differ diff --git a/plbf/utils/__pycache__/ThresMaxDivDP.cpython-311.pyc b/plbf/utils/__pycache__/ThresMaxDivDP.cpython-311.pyc new file mode 100644 index 0000000..af6962c Binary files /dev/null and b/plbf/utils/__pycache__/ThresMaxDivDP.cpython-311.pyc differ diff --git a/plbf/utils/__pycache__/__init__.cpython-311.pyc b/plbf/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..f326946 Binary files /dev/null and b/plbf/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/plbf/utils/__pycache__/calc_DPKL.cpython-311.pyc b/plbf/utils/__pycache__/calc_DPKL.cpython-311.pyc new file mode 100644 index 0000000..0fc852e Binary files /dev/null and b/plbf/utils/__pycache__/calc_DPKL.cpython-311.pyc differ diff --git a/plbf/utils/__pycache__/const.cpython-311.pyc b/plbf/utils/__pycache__/const.cpython-311.pyc new file mode 100644 index 0000000..63b0fe0 Binary files /dev/null and b/plbf/utils/__pycache__/const.cpython-311.pyc differ diff --git a/plbf/utils/__pycache__/fast_calc_DPKL.cpython-311.pyc b/plbf/utils/__pycache__/fast_calc_DPKL.cpython-311.pyc new file mode 100644 index 0000000..4fa69b0 Binary files /dev/null and b/plbf/utils/__pycache__/fast_calc_DPKL.cpython-311.pyc differ diff --git a/plbf/utils/__pycache__/matrix_problem_on_monotone_matrix.cpython-311.pyc b/plbf/utils/__pycache__/matrix_problem_on_monotone_matrix.cpython-311.pyc new file mode 100644 index 0000000..02a4258 Binary files /dev/null and b/plbf/utils/__pycache__/matrix_problem_on_monotone_matrix.cpython-311.pyc differ diff --git a/plbf/utils/__pycache__/prList.cpython-311.pyc b/plbf/utils/__pycache__/prList.cpython-311.pyc new file mode 100644 index 0000000..a24cc7f Binary files /dev/null and b/plbf/utils/__pycache__/prList.cpython-311.pyc differ diff --git a/plbf/utils/calc_DPKL.py b/plbf/utils/calc_DPKL.py new file mode 100644 index 0000000..bc43c79 --- /dev/null +++ b/plbf/utils/calc_DPKL.py @@ -0,0 +1,47 @@ +import math +from typing import Tuple +from .prList import prList +from .const import INF + + +def calc_DPKL(g: prList, h: prList, k: int, j: int = None) -> Tuple[list[list[float]], list[list[int]]]: + """ + + Args: + g (prList): key density of each segmenet + h (prList): non-keye density of each segmenet + k (int): number of regions + j (int): 1-th to j-th segments are clustered + + Returns: + Tuple[list[list[float]], list[list[int]]]: DPKL, DPPre 
+ """ + + N = g.N + if j is None: + j = N + + DPKL = [[-INF for _ in range(k + 1)] for _ in range(j + 1)] + DPPre = [[None for _ in range(k + 1)] for _ in range(j + 1)] + DPKL[0][0] = 0 + for n in range(1, j + 1): + for q in range(1, k + 1): + for i in range(1, n + 1): + # i-th to n-th segments are clustered into q-th region + + Pos = g.acc_range_idx(i, n) + Neg = h.acc_range_idx(i, n) + + if Neg == 0: + continue + if Pos == 0: + tmp_sum = DPKL[i-1][q-1] + 0 + else: + tmp_sum = DPKL[i-1][q-1] + Pos * math.log(Pos / Neg) + + if DPKL[n][q] < tmp_sum: + DPKL[n][q] = tmp_sum + DPPre[n][q] = i-1 + + return DPKL, DPPre + diff --git a/plbf/utils/const.py b/plbf/utils/const.py new file mode 100644 index 0000000..ba023ac --- /dev/null +++ b/plbf/utils/const.py @@ -0,0 +1,3 @@ +EPS = 1e-8 +INF = float('inf') + diff --git a/plbf/utils/fast_calc_DPKL.py b/plbf/utils/fast_calc_DPKL.py new file mode 100644 index 0000000..27cb2e3 --- /dev/null +++ b/plbf/utils/fast_calc_DPKL.py @@ -0,0 +1,65 @@ +import math +from typing import Tuple +from .prList import prList +from .const import INF +from .matrix_problem_on_monotone_matrix import matrix_problem_on_monotone_matrix + + + +def fast_calc_DPKL(g: prList, h: prList, k: int) -> Tuple[list[list[float]], list[list[int]]]: + """ + + Args: + g (prList): key density of each segmenet + h (prList): non-keye density of each segmenet + k (int): number of regions + + Returns: + Tuple[list[list[float]], list[list[int]]]: DPKL, DPPre + """ + + N = g.N + + DPKL = [[-INF for _ in range(k + 1)] for _ in range(N + 1)] + DPPre = [[None for _ in range(k + 1)] for _ in range(N + 1)] + DPKL[0][0] = 0 + for j in range(1, k + 1): + + def func_A(p: int, i: int) -> float: + """ + func_A(p, i) + = A_{pi} + + = { -INF (i = p+1, p+2, ..., N-1) + { DPKL[i-1][j-1] + dkl(i, p) (i = 1, ..., p) + + Args: + p (int): \in {1 ... N} + i (int): \in {1 ... N} + + Returns: + float: A_{pi} + """ + + if i >= p+1: + return -INF + + Pos = g.acc_range_idx(i, p) + Neg = h.acc_range_idx(i, p) + + if Neg == 0: + return -INF + if Pos == 0: + return DPKL[i-1][j-1] + 0 + + return DPKL[i-1][j-1] + Pos * math.log(Pos / Neg) + + max_args = matrix_problem_on_monotone_matrix(func_A, N, N) + + for n in range(1, N + 1): + pre = max_args[n] + DPKL[n][j] = func_A(n, pre) + DPPre[n][j] = pre-1 + + return DPKL, DPPre + diff --git a/plbf/utils/matrix_problem_on_monotone_matrix.py b/plbf/utils/matrix_problem_on_monotone_matrix.py new file mode 100644 index 0000000..683aa1d --- /dev/null +++ b/plbf/utils/matrix_problem_on_monotone_matrix.py @@ -0,0 +1,40 @@ +import math +from collections.abc import Callable +from .const import INF + +def matrix_problem_on_monotone_matrix(f: Callable[[int, int], float], n: int, m: int) -> list[int]: + + """ + + Args: + f (Callable[[int, int], float]): returns B_{ij} (B is a monotone matrix) ({1 ... n} x {1 ... m} -> float). + n (int): B is a n * m matrix + m (int): B is a n * m matrix + + Returns: + list[int]: a[i] = J(i) (i.e., a[i] is the smallest j that B_{i,j} equals the maximum value of the i-th row of B). 
+ """ + + a = [None for i in range(n + 1)] + + def CalcJ(i, jl, jr): + max = -INF + argmax = jl + for j in range(jl, jr+1): + if f(i, j) > max: + max = f(i, j) + argmax = j + return argmax + + def RecSolveMP(il, ir, jl, jr): + if il > ir: + return + i = math.floor((il + ir) / 2) + j = CalcJ(i, jl, jr) + a[i] = j + RecSolveMP(il, i-1, jl, j) + RecSolveMP(i+1, ir, j, jr) + + RecSolveMP(1, n, 1, m) + return a + diff --git a/plbf/utils/prList.py b/plbf/utils/prList.py new file mode 100644 index 0000000..654e07a --- /dev/null +++ b/plbf/utils/prList.py @@ -0,0 +1,125 @@ +import math +from .const import EPS +import bisect + +class prList: + + def __init__(self, scores: list[float], thre_list: list[float]): + """ + + Args: + scores (list[float]): a list of scores + thre_list (list[float]): thresholds for divide scores into segment + + """ + + assert(thre_list[0] == 0) + assert(thre_list[-1] == 1) + + self.thre_list = thre_list + self.N = len(thre_list) - 1 + + cnt_list = [0 for _ in range(self.N + 1)] + for score in scores: + assert(0 <= score <= 1) + + segment_idx = bisect.bisect_left(thre_list, score) + if segment_idx == 0: + assert(score == 0) + segment_idx = 1 + + assert(1 <= segment_idx <= self.N) + + cnt_list[segment_idx] += 1 + + total_cnt = len(scores) + + self.pr = [0.0 for i in range(self.N+1)] + self.accPr = [0.0 for i in range(self.N+1)] + for i in range(1, self.N + 1): + self.pr[i] = cnt_list[i] / total_cnt + self.accPr[i] = self.accPr[i - 1] + self.pr[i] + + assert(abs(self.accPr[self.N] - 1.0) < EPS), self.accPr[self.N] + + def get_th_idx(self, score: float) -> int: + """ + 0 --> 0 + 1/N --> 1 + 2/N --> 2 + ... + N/N --> N + + Args: + score (float): score + + Returns: + int: idx + """ + + idx = math.floor(score * self.N + 0.5) + assert(abs(idx - score * self.N) < 1e-9) + + return idx + + + def acc(self, score: float) -> float: + """ + + Args: + score (float): \in [0, 1] + Returns: + float: accumulated probability in [0, score] + """ + + idx = self.get_th_idx(score) + + return self.accPr[idx] + + + def acc_range(self, score_l: float, score_r: float) -> float: + """ + + Args: + score_l (float): \in [0, 1] + score_r (float): \in [0, 1] + + Returns: + float: accumulated probability in [score_l, score_r] + """ + + idx_l = self.get_th_idx(score_l) + idx_r = self.get_th_idx(score_r) + + return self.accPr[idx_r] - self.accPr[idx_l] + + def acc_idx(self, idx: int) -> float: + """ + + Args: + idx (int): idx \in {1 ... N} + + Returns: + float: sum of self.pr[1...idx] + """ + + assert(1 <= idx <= self.N) + + return self.accPr[idx] + + def acc_range_idx(self, idx_l: int, idx_r: int) -> float: + """ + + Args: + idx_l (int): idx \in {1 ... N} + idx_l (int): idx \in {1 ... 
N} + + Returns: + float: sum of self.pr[idx_l...idx_r] + """ + + assert(1 <= idx_l <= self.N) + assert(1 <= idx_r <= self.N) + + return self.accPr[idx_r] - self.accPr[idx_l - 1] + diff --git a/plbf/yelp_main.py b/plbf/yelp_main.py new file mode 100644 index 0000000..a81c190 --- /dev/null +++ b/plbf/yelp_main.py @@ -0,0 +1,110 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd +import lib.network +import lib.data_processing +import lib.lgb_url +from plbf import FastPLBF_M + + +data_train = pd.read_csv('../dataset/yelp/yelp_train.csv') +data_test = pd.read_csv('../dataset/yelp/yelp_test.csv') +data_query = pd.read_csv('../dataset/yelp/yelp_query.csv') + +word_dict, region_dict = lib.data_processing.loading_embedding("yelp") + + +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + data_train = data_train.reset_index(drop=True) + data_train['keywords'] = data_train['keywords'].astype(str) + data_train['keywords'] = data_train['keywords'].apply(str.lower) + + true_num = data_train[data_train['is_in'] == 1].shape[0] + false_num = data_train[data_train['is_in'] == 0].shape[0] + + insert = pd.DataFrame() + insert = data_train.apply(lib.network.insert, axis=1) + positive_insert = insert[data_train['is_in'] == 1] + + # region embedding + data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,)) + data_train.drop(columns=['lat', 'lon'], inplace=True) + + # time embedding + data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding) + + # keywords embedding + data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,)) + + # 生成一个用于神经网络输入的dataframe:embedding + embedding = pd.DataFrame() + embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1) + # print(embedding) + y = data_train['is_in'] + del data_train + X = pd.DataFrame(embedding['embedding'].apply(pd.Series)) + # print(X) + return X, y, insert, true_num, false_num, positive_insert + + +X_train, y_train, train_insert, train_true, train_false, train_positive_insert = yelp_embedding(data_train, + word_dict=word_dict, + region_dict=region_dict) +X_test, y_test, test_insert, test_true, test_false, test_positive_insert = yelp_embedding(data_test, + word_dict=word_dict, + region_dict=region_dict) +X_query, y_query, query_insert, query_true, query_false, query_positive_insert = yelp_embedding(data_query, + word_dict=word_dict, + region_dict=region_dict) +print(train_positive_insert) +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) + +n_true = train_true + test_true +n_false = test_false + train_false + +n_test = test_true + test_false + + +bst = lgb.Booster(model_file='../best_bst_20480') + +y_pred_train = bst.predict(X_train) +y_pred_test = bst.predict(X_test) +y_pred_query = bst.predict(X_query) + +train_results = pd.DataFrame({ + 'url': train_insert, + 'label': y_train, + 'score': y_pred_train +}) + +test_results = pd.DataFrame({ + 'url': test_insert, + 'label': y_test, + 'score': y_pred_test +}) +all_results = pd.concat([train_results, test_results]) +all_results.to_csv('url_results.csv', index=False) +query_results = pd.DataFrame({ + 'url': query_insert, + 'label': y_query, + 'score': y_pred_query +}) +query_results.to_csv('query_results.csv', index=False) +# 初始化变量 +model_size = 
lib.lgb_url.lgb_get_model_size(bst) +print("模型在内存中所占用的大小(字节):", model_size) + +for size in range(64 * 1024, 320 * 1024 + 1, 64 * 1024): + bloom_size = size - model_size + print(bloom_size) + bloom_size = bloom_size * 8.0 + FastPLBF_M.run( + path='url_results.csv', + query_path='query_results.csv', + M=bloom_size, + N=50, + k=5, + ) diff --git a/sandwich-lbf/Bloom_filter.py b/sandwich-lbf/Bloom_filter.py new file mode 100644 index 0000000..48afcd7 --- /dev/null +++ b/sandwich-lbf/Bloom_filter.py @@ -0,0 +1,134 @@ +import numpy as np +import pandas as pd +from sklearn.utils import murmurhash3_32 +import random +import serialize +import argparse +from pathlib import Path +import time + + +class hashfunc(object): + def __init__(self, m): + self.m = m + self.ss = random.randint(1, 99999999) + + def __call__(self, x): + return murmurhash3_32(x, seed=self.ss) % self.m + + +''' +Class for Standard Bloom filter +''' + + +class BloomFilter(): + def __init__(self, n, hash_len): + self.n = n + self.hash_len = int(hash_len) + if (self.hash_len == 0): + raise SyntaxError('The hash table is empty') + if (self.n > 0) & (self.hash_len > 0): + self.k = max(1, int(self.hash_len / n * 0.6931472)) + elif (self.n == 0): + self.k = 1 + self.h = [] + for i in range(self.k): + self.h.append(hashfunc(self.hash_len)) + self.table = np.zeros(self.hash_len, dtype=int) + + def insert(self, key): + if self.hash_len == 0: + raise SyntaxError('cannot insert to an empty hash table') + for i in key: + for j in range(self.k): + t = self.h[j](i) + self.table[t] = 1 + + # def test(self, key): + # test_result = 0 + # match = 0 + # if self.hash_len > 0: + # for j in range(self.k): + # t = self.h[j](key) + # match += 1*(self.table[t] == 1) + # if match == self.k: + # test_result = 1 + # return test_result + + def test(self, keys, single_key=True): + if single_key: + test_result = 0 + match = 0 + if self.hash_len > 0: + for j in range(self.k): + t = self.h[j](keys) + match += 1 * (self.table[t] == 1) + if match == self.k: + test_result = 1 + else: + test_result = np.zeros(len(keys)) + ss = 0 + if self.hash_len > 0: + for key in keys: + match = 0 + for j in range(self.k): + t = self.h[j](key) + match += 1 * (self.table[t] == 1) + if match == self.k: + test_result[ss] = 1 + ss += 1 + return test_result + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, + help="path of the dataset") + parser.add_argument('--size_of_BF', action="store", dest="R_sum", type=int, required=True, help="size of the BF") + parser.add_argument('--pos_ratio', action="store", dest="pos_ratio", type=float, required=True, + help="size of the BF", default=0.7) + parser.add_argument('--neg_ratio', action="store", dest="neg_ratio", type=float, required=True, + help="size of the BF", default=0.7) + parser.add_argument("--negTest_ratio", action="store", dest="negTest_ratio", type=float, default=1.0) + parser.add_argument("--test_path", action="store", dest="test_path", type=str, default=None) + parser.add_argument("--save_path", action="store", dest="save_path", type=str, default=None) + seed = 22012022 + rs = np.random.RandomState(seed) + random.seed(seed) + + args = parser.parse_args() + data_path = Path(args.data_path) + data_test_path = Path(args.test_path) if args.test_path is not None else None + R_sum = args.R_sum + pos_ratio = args.pos_ratio + neg_ratio = args.neg_ratio + negTest_ratio = args.negTest_ratio + data_test_path = args.test_path + 
dataset = serialize.load_dataset(data_path) + neg_label = serialize.find_neg_label(dataset) + dataset_test = serialize.load_dataset(data_path) if data_test_path is not None else None + print( + f"Total samples: {len(dataset.index)}. (Pos, Neg): ({len(dataset[(dataset['label'] == 1)])}, {len(dataset[(dataset['label'] == neg_label)])})") + data, query_negative = serialize.divide_dataset(dataset, dataset_test, pos_ratio, neg_ratio, negTest_ratio, rs) + del (dataset) + print( + f"Samples for filters training: {len(data.index)}. (Pos, Neg): ({len(data[(data['label'] == 1)])}, {len(data[(data['label'] == neg_label)])})") + print(f"Samples for filters testing: {len(query_negative.index)}") + # print(query_negative.iloc[:, 0].head()) + + negative_sample = data.loc[(data.iloc[:, -1] == neg_label)] # label? + positive_sample = data.loc[(data.iloc[:, -1] == 1)] + query = positive_sample.iloc[:, 0] + n = len(query) + bloom_filter = BloomFilter(n, R_sum) + bloom_filter.insert(query) + start = time.time() + n1 = bloom_filter.test(query_negative.iloc[:, 0], single_key=False) + end = time.time() + print('False positive rate: ', sum(n1) / len(query_negative), ", Time:", (end - start) / len(query_negative)) + if args.save_path: + file1 = open(args.save_path, 'a') + file1.write( + str(data_path) + "," + str(R_sum) + "," + str(np.round(sum(n1) / len(query_negative), 5)) + "," + str( + (end - start) / len(query_negative)) + "\n") \ No newline at end of file diff --git a/sandwich-lbf/__init__.py b/sandwich-lbf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sandwich-lbf/__pycache__/Bloom_filter.cpython-311.pyc b/sandwich-lbf/__pycache__/Bloom_filter.cpython-311.pyc new file mode 100644 index 0000000..28d7bca Binary files /dev/null and b/sandwich-lbf/__pycache__/Bloom_filter.cpython-311.pyc differ diff --git a/sandwich-lbf/__pycache__/slbf.cpython-311.pyc b/sandwich-lbf/__pycache__/slbf.cpython-311.pyc new file mode 100644 index 0000000..c7ec2f1 Binary files /dev/null and b/sandwich-lbf/__pycache__/slbf.cpython-311.pyc differ diff --git a/sandwich-lbf/main.py b/sandwich-lbf/main.py new file mode 100644 index 0000000..6790ab1 --- /dev/null +++ b/sandwich-lbf/main.py @@ -0,0 +1,62 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd + +import lib.lgb_url +import slbf + + +df_train = pd.read_csv('../dataset/url_train.csv') +df_test = pd.read_csv('../dataset/url_test.csv') +df_query = pd.read_csv('../dataset/url_query.csv') + +train_urls = df_train['url'] +test_urls = df_test['url'] +query_urls = df_query['url'] + +X_train = df_train.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_train = df_train['url_type'].values.astype(np.float32) +X_test = df_test.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_test = df_test['url_type'].values.astype(np.float32) +X_query = df_query.drop(columns=['url', 'url_type']).values.astype(np.float32) +y_query = df_query['url_type'].values.astype(np.float32) + +train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) +test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False) +n_true = df_train[df_train['url_type'] == 1].shape[0] + df_test[df_test['url_type'] == 1].shape[0] +n_test = len(df_test) + +bst = lgb.Booster(model_file='../best_bst_20480') + +y_pred_train = bst.predict(X_train) +y_pred_test = bst.predict(X_test) + +train_results = pd.DataFrame({ + 'url': train_urls, + 'label': y_train, + 'score': y_pred_train +}) + +test_results = pd.DataFrame({ + 'url': test_urls, + 'label': y_test, + 
'score': y_pred_test
+})
+all_results = pd.concat([train_results, test_results])
+all_results.to_csv('url_results.csv', index=False)
+
+# Initialize variables
+model_size = lib.lgb_url.lgb_get_model_size(bst)
+print("Model size in memory (bytes):", model_size)
+
+for size in range(64 * 1024, 320 * 1024 + 1, 64 * 1024):
+    bloom_size = size - model_size
+    slbf.run(
+        R_sum=bloom_size * 8,
+        path='url_results.csv',
+        model=bst,
+        X_query=X_query,
+        y_query=y_query,
+        query_urls=query_urls
+    )
+    size *= 2
diff --git a/sandwich-lbf/slbf.py b/sandwich-lbf/slbf.py
new file mode 100644
index 0000000..8893418
--- /dev/null
+++ b/sandwich-lbf/slbf.py
@@ -0,0 +1,158 @@
+import math
+
+import numpy as np
+import pandas as pd
+from Bloom_filter import BloomFilter
+
+
+class SLBF:
+    def __init__(self, keys, filter_size_b1, filter_size_b2, threshold):
+        '''
+        keys: df in the following form
+            (index) data label score
+        '''
+        self.filter_size_b1 = filter_size_b1
+        self.filter_size_b2 = filter_size_b2
+        self.threshold = threshold
+
+        self.initial_keys = keys
+        if filter_size_b1 > 0:
+            self.initial_bf = BloomFilter(len(self.initial_keys),
+                                          filter_size_b1 * len(self.initial_keys))
+            self.initial_bf.insert(self.initial_keys.iloc[:, 0])
+        else:
+            self.initial_bf = None
+        self.backup_keys = keys[(keys.iloc[:, -1] <= threshold)]
+        self.backup_bf = BloomFilter(len(self.backup_keys), filter_size_b2 * len(self.initial_keys))
+        self.backup_bf.insert(self.backup_keys.iloc[:, 0])
+
+    def query(self, query_set):
+        ml_false_positive = (query_set.iloc[:,
+                             -1] > self.threshold)  # mask of false positives produced by the model w.r.t. the chosen threshold
+        ml_true_negative = (query_set.iloc[:,
+                            -1] <= self.threshold)  # mask of true negatives produced by the model w.r.t. the chosen threshold
+        # Compute the FPR
+        initial_bf_false_positive = self.initial_bf.test(query_set.iloc[:, 0],
+                                                         single_key=False) if self.initial_bf is not None else np.full(
+            len(query_set), True)  # if the initial BF is not present, every query sample counts as an initial-BF "false positive"
+        ml_false_positive_list = query_set.iloc[:, 0][(initial_bf_false_positive) & (ml_false_positive)]
+        ml_true_negative_list = query_set.iloc[:, 0][(initial_bf_false_positive) & (ml_true_negative)]
+        backup_bf_false_positive = self.backup_bf.test(ml_true_negative_list, single_key=False)
+        total_false_positive = sum(backup_bf_false_positive) + len(ml_false_positive_list)
+
+        return total_false_positive
+
+    def query_single_key(self, key, score):
+        if self.initial_bf is None or self.initial_bf.test(key):
+            if score > self.threshold:
+                return True
+            else:
+                if self.backup_bf.test(key):
+                    return True
+                else:
+                    return False
+        else:
+            return False
+
+
+def train_slbf(filter_size, query_train_set, keys):
+    # train_dataset = np.array(pd.concat([query_train_set, keys]).iloc[:, -1])
+    # thresholds_list = [np.quantile(train_dataset, i * (1 / quantile_order)) for i in
+    #                    range(1, quantile_order)] if quantile_order < len(train_dataset) else np.sort(train_dataset)
+    # thresh_third_quart_idx = (3 * len(thresholds_list) - 1) // 4
+    thresholds_list = [round(i * 0.01, 2) for i in range(1, 100)]
+
+    fp_opt = query_train_set.shape[0]
+    slbf_opt = None  # TODO: change
+    # print("thresholds_list:", thresholds_list)
+    for threshold in thresholds_list:
+        ml_false_positive = (query_train_set.iloc[:, -1] > threshold)
+        ml_false_negative = (keys.iloc[:, -1] <= threshold)
+
+        FP = query_train_set[ml_false_positive].iloc[:, 0].size / query_train_set.iloc[:, 0].size
+        FN = keys[ml_false_negative].iloc[:, 0].size / keys.iloc[:, 0].size
+
+        if FP == 0.0:
+            print("FP = 
0, skip") + # filter_opt = learned_bloom_filter.main(classifier_score_path, correct_size_filter, other) + slbf_opt = SLBF(keys, 0, filter_size, threshold) + continue + if FN == 1.0 or FN == 0.0: + # print("FP is equal to 1.0, or FN is equal to 0 or 1, skipping threshold") + continue + if FP + FN > 1: # If FP + FN >= 1, the budget b2 becomes negative + # print("FP + FN >= 1, skipping threshold") + continue + + b2 = FN * math.log(FP / ((1 - FP) * ((1 / FN) - 1)), 0.6185) + b1 = filter_size - b2 + if b1 <= 0: # Non serve avere SLBF + print("b1 = 0") + b1 = 0 + break + + # print(f"FP: {FP}, FN: {FN}, b: {filter_size}, b1: {b1}, b2: {b2}") + + slbf = SLBF(keys, b1, b2, threshold) + fp_items = slbf.query(query_train_set) + # print(f"\tFalse positive items: {fp_items}") + if fp_items < fp_opt: + fp_opt = fp_items + slbf_opt = slbf + # print(f"False positive items: {fp_items} - Current threshold: {threshold}") + if slbf_opt is None: + print("FN + FP >= 1 with all the thresold, is impossible to build a SLBF") + fp_items = slbf_opt.query(query_train_set) + print(f"Chosen thresholds: {slbf_opt.threshold} - False positive items: {fp_items}") + + return slbf_opt, fp_opt + + +def get_slbf_opt(positive_sample, train_negative, R_sum): + b = R_sum / len(positive_sample) + slbf_opt, fp_opt = train_slbf(b, train_negative, positive_sample) + return slbf_opt + + +def run(R_sum, path, model, X_query, y_query, query_urls): + data = pd.read_csv(path) + negative_sample = data.loc[(data['label'] == 0)] + positive_sample = data.loc[(data['label'] == 1)] + # train_negative = negative_sample.sample(frac=0.8) + train_negative = negative_sample + print("start running") + slbf_opt = get_slbf_opt(positive_sample, train_negative, R_sum) + fn = 0 + fp = 0 + cnt_ml = 0 + cnt_bf = 0 + total = len(X_query) + print(f"query count = {total}") + prediction_results = model.predict(X_query) + + for i in range(total): + true_label = y_query[i] + url = query_urls[i] + score = prediction_results[i] + result = slbf_opt.query_single_key(key=url, score=score) + if true_label == 0 and result == 1: + fp += 1 + elif true_label == 1 and result == 0: + fn += 1 + + print(f"fp: {fp}") + print(f"total: {total}") + print(f"fpr: {float(fp) / total}") + print(f"fnr: {float(fn) / total}") + print(f"cnt_ml: {cnt_ml}") + print(f"cnt_bf: {cnt_bf}") + return float(fp) / total + +# if __name__ == '__main__': +# parser = argparse.ArgumentParser() +# parser.add_argument('--data_path', action="store", dest="data_path", type=str, required=True, +# help="path of the dataset") +# parser.add_argument('--size_of_Sandwiched', action="store", dest="R_sum", type=int, required=True, +# help="size of the Ada-BF") +# result = parser.parse_known_args() +# main(result[0].data_path, result[0].R_sum, result[1]) diff --git a/sandwich-lbf/yelp_main.py b/sandwich-lbf/yelp_main.py new file mode 100644 index 0000000..7c22d03 --- /dev/null +++ b/sandwich-lbf/yelp_main.py @@ -0,0 +1,115 @@ +import lightgbm as lgb +import numpy as np +import pandas as pd +import lib.network +import lib.data_processing +import lib.lgb_url +import slbf + +data_train = pd.read_csv('../dataset/yelp/yelp_train.csv') +data_test = pd.read_csv('../dataset/yelp/yelp_test.csv') +data_query = pd.read_csv('../dataset/yelp/yelp_query.csv') + +word_dict, region_dict = lib.data_processing.loading_embedding("yelp") + + +def yelp_embedding(data_train, word_dict=word_dict, region_dict=region_dict): + data_train['keywords'] = data_train['keywords'].str.split(' ') + data_train = data_train.explode('keywords') + 
data_train = data_train.reset_index(drop=True)
+    data_train['keywords'] = data_train['keywords'].astype(str)
+    data_train['keywords'] = data_train['keywords'].apply(str.lower)
+
+    true_num = data_train[data_train['is_in'] == 1].shape[0]
+    false_num = data_train[data_train['is_in'] == 0].shape[0]
+    insert = pd.DataFrame()
+    insert = data_train.apply(lib.network.insert, axis=1)
+
+    # region embedding
+    data_train['region'] = data_train.apply(lib.network.region_mapping, axis=1, args=(region_dict,))
+    data_train.drop(columns=['lat', 'lon'], inplace=True)
+
+    # time embedding
+    data_train['timestamp'] = data_train['timestamp'].apply(lib.network.time_embedding)
+
+    # keywords embedding
+    data_train['keywords'] = data_train['keywords'].apply(lib.network.keywords_embedding, args=(word_dict,))
+
+    # build a dataframe of embeddings used as the network input
+    embedding = pd.DataFrame()
+    embedding['embedding'] = data_train.apply(lib.network.to_embedding, axis=1)
+    # print(embedding)
+    y = data_train['is_in']
+    del data_train
+    X = pd.DataFrame(embedding['embedding'].apply(pd.Series))
+    # print(X)
+    return X, y, insert, true_num, false_num
+
+
+X_train, y_train, train_insert, train_true, train_false = yelp_embedding(data_train, word_dict=word_dict,
+                                                                         region_dict=region_dict)
+X_test, y_test, test_insert, test_true, test_false = yelp_embedding(data_test, word_dict=word_dict,
+                                                                    region_dict=region_dict)
+X_query, y_query, query_insert, query_true, query_false = yelp_embedding(data_query, word_dict=word_dict,
+                                                                         region_dict=region_dict)
+print(query_insert)
+
+n_true = train_true + test_true
+n_false = test_false + train_false
+
+n_test = test_true + test_false
+
+# Free memory
+
+
+# 3. Split into training and test sets
+X_train = X_train.values.astype(np.float32)
+X_test = X_test.values.astype(np.float32)
+y_train = y_train.values.astype(np.float32)
+y_test = y_test.values.astype(np.float32)
+# 4. Create the LightGBM datasets
+train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
+test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, free_raw_data=False)
+query_data = lgb.Dataset(X_query, label=y_query, free_raw_data=False)
+
+bst = lgb.Booster(model_file='../best_bst_20480')
+
+y_pred_train = bst.predict(X_train)
+y_pred_test = bst.predict(X_test)
+y_pred_query = bst.predict(X_query)
+
+train_results = pd.DataFrame({
+    'url': train_insert,
+    'label': y_train,
+    'score': y_pred_train
+})
+
+test_results = pd.DataFrame({
+    'url': test_insert,
+    'label': y_test,
+    'score': y_pred_test
+})
+all_results = pd.concat([train_results, test_results])
+all_results.to_csv('url_results.csv', index=False)
+query_results = pd.DataFrame({
+    'url': query_insert,
+    'label': y_query,
+    'score': y_pred_query
+})
+query_results.to_csv('query_results.csv', index=False)
+
+# Initialize variables
+model_size = lib.lgb_url.lgb_get_model_size(bst)
+print("Model size in memory (bytes):", model_size)
+
+for size in range(64 * 1024, 320 * 1024 + 1, 64 * 1024):
+    bloom_size = size - model_size
+    slbf.run(
+        R_sum=bloom_size * 8,
+        path='url_results.csv',
+        model=bst,
+        X_query=X_query,
+        y_query=y_query,
+        query_urls=query_insert
+    )
diff --git a/test.py b/test.py
index 28e60be..65e06da 100644
--- a/test.py
+++ b/test.py
@@ -1,5 +1,3 @@
-import sys
+import model_generate_yelp
 
-# 获取模型对象的内存大小(以字节为单位)
-model_memory_usage = sys.getsizeof(model)
-print("模型占用的内存大小(字节):", model_memory_usage)
+model_generate_yelp.get_model(20 * 1024)
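A minimal sketch of the size-budget convention shared by the driver scripts above (plbf/main.py, plbf/tweet_main.py, plbf/yelp_main.py, sandwich-lbf/main.py, sandwich-lbf/yelp_main.py), assuming, as those scripts do, that lib.lgb_url.lgb_get_model_size returns the model size in bytes and that the filter budgets (PLBF's M, the SLBF's R_sum) are expressed in bits. The helper name below is illustrative and not part of the repository:

# Hypothetical helper (not in the diff): bytes left for the backup Bloom filters
# after the learned model is accounted for, converted to bits.
def backup_filter_bits(total_bytes: int, model_bytes: int) -> float:
    return (total_bytes - model_bytes) * 8.0

# e.g. a 64 KiB overall budget with a 20 KiB model leaves
# (64 - 20) * 1024 * 8 = 360448 bits for the filters.
print(backup_filter_bits(64 * 1024, 20 * 1024))  # 360448.0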