Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 67 additions & 42 deletions helper_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,50 +245,75 @@ def get_signal_names(string):
### Evaluation functions

# Compute the Challenge score.
def _tied_with(values, reference, tie_tol):
    """Mark which of `values` are tied with `reference`.

    Two scores are tied when they are equal or when their absolute difference
    is at most `tie_tol`. No relative tolerance is applied, so `tie_tol=0.0`
    means exact equality. The explicit equality test keeps equal infinities
    tied (inf - inf is NaN and would otherwise fail the difference test).
    """
    with np.errstate(invalid='ignore'):
        return (values == reference) | (np.abs(values - reference) <= tie_tol)


def compute_challenge_score(labels, outputs, fraction_capacity=0.05, tie_tol=0.0, **kwargs):
    """
    Exact expected TPR under uniform random tie-breaking at the capacity cutoff.

    The top C = floor(fraction_capacity * n) instances by score are "selected".
    When the cutoff falls inside a group of tied scores, the members of that
    group are assumed to be chosen uniformly at random, and the expected number
    of selected positives is computed in closed form instead of by simulation.

    Parameters
    ----------
    labels : array-like of shape (n,)
        Binary labels, expected to be 0/1 (other values are treated as not-equal to 1.0).
    outputs : array-like of shape (n,)
        Scores used for ranking (higher is better).
    fraction_capacity : float, default=0.05
        Fraction of instances to select. Capacity C = floor(fraction_capacity * n).
    tie_tol : float, default=0.0
        Two scores s1, s2 are considered tied if |s1 - s2| <= tie_tol.
        Use 0.0 to require exact equality, or a small tolerance (e.g., 1e-12) if desired.
    **kwargs : additional keyword arguments (ignored).
        Accepted so that callers of the previous, permutation-based version
        (which took `num_permutations` and `seed`) keep working.

    Returns
    -------
    tpr : float
        Expected true positive rate over uniform random tie-breaking.
        Returns NaN if there are no positive labels.

    Raises
    ------
    ValueError
        If `labels` and `outputs` do not have the same shape.
    """
    labels = np.asarray(labels, dtype=np.float64)
    outputs = np.asarray(outputs, dtype=np.float64)

    # Explicit check rather than `assert`, which is removed under `python -O`.
    if labels.shape != outputs.shape:
        raise ValueError(
            'labels and outputs must have the same shape; got {} and {}'.format(
                labels.shape, outputs.shape))

    n = labels.size
    capacity = int(np.floor(fraction_capacity * n))
    total_positives = float(np.sum(labels == 1.0))

    # Handle degenerate cases first.
    if total_positives == 0.0:
        return float('nan')  # undefined TPR if no positives
    if capacity <= 0:
        return 0.0  # nothing selected
    if capacity >= n:
        return 1.0  # everything selected; TP = P

    # Sort by score, descending. Equal scores end up contiguous, which is all
    # the tie-block logic below relies on.
    order = np.argsort(outputs, kind='mergesort')[::-1]
    scores_sorted = outputs[order]
    labels_sorted = labels[order]

    # If the boundary between included/excluded does not split a tie, the
    # selection is deterministic.
    v_incl = scores_sorted[capacity - 1]
    v_excl = scores_sorted[capacity]
    if not _tied_with(v_excl, v_incl, tie_tol):
        tp = float(np.sum(labels_sorted[:capacity] == 1.0))
        return tp / total_positives

    # The cutoff splits a tie: find the contiguous tie block [start, end).
    tie_idxs = np.flatnonzero(_tied_with(scores_sorted, v_incl, tie_tol))
    start = int(tie_idxs[0])
    end = int(tie_idxs[-1]) + 1  # exclusive
    g = end - start              # size of the tie block
    m = capacity - start         # slots left for the tie block (0 < m < g)

    # Everything strictly above the tie block is always selected.
    pos_before = float(np.sum(labels_sorted[:start] == 1.0))

    # Positives inside the tie block.
    k = float(np.sum(labels_sorted[start:end] == 1.0))

    # Drawing m of g items uniformly without replacement, with k positives among
    # them, is hypergeometric: E[TP from tie block] = m * k / g.
    expected_tp = pos_before + m * (k / g)

    return expected_tp / total_positives

def compute_auc(labels, outputs):
import sklearn
Expand Down