@@ -9,11 +9,16 @@ Natural Language Generation Performance:
9
9
[ Extractiveness] ( https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks ) :
10
10
11
11
* Extractiveness Coverage:
12
+ - Extent to which a summary is derivative of a text
12
13
- Percentage of words in the summary that are part of an extractive fragment with the article
13
14
* Extractiveness Density:
15
+ - How well the word sequence can be described as a series of extractions
16
+ - A summary might contain many individual words from the article and therefore have a high coverage.
17
+ - However, if arranged in a new order, the words of the summary could still be used to convey ideas not present in the article.
14
18
- Average length of the extractive fragment to which each word in the summary belongs
15
19
* Extractiveness Compression:
16
20
- Word ratio between the article and the summary
21
+ - Summarizing with higher compression is challenging as it requires capturing more precisely the critical aspects of the article text.
17
22
18
23
API Performance:
19
24
@@ -119,7 +124,7 @@ token_cols = ['Input Token Usage', 'Output Token Usage']
119
124
other_metrics = ['Cost (USD)', 'Duration (s)']
120
125
all_metrics = extractiveness_cols + token_cols + other_metrics
121
126
122
- # Metric histograms by model
127
+ # Metric Histograms by Model
123
128
plt.style.use('default')
124
129
fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics)))
125
130
@@ -144,6 +149,36 @@ for i, metric in enumerate(all_metrics):
144
149
plt.tight_layout()
145
150
plt.show()
146
151
152
+ # Metric Statistics by Model
153
+ for metric in all_metrics:
154
+ print(f"\n{metric.upper()}:")
155
+ desc_stats = df.groupby('MODEL')[metric].agg([
156
+ 'count', 'mean', 'std', 'min', 'median','max'
157
+ ])
158
+
159
+ print(desc_stats)
160
+
161
+
162
+ # Calculate Efficiency Metrics By model
163
+ df_analysis = df.copy()
164
+ df_analysis['Total Token Usage'] = df_analysis['Input Token Usage'] + df_analysis['Output Token Usage']
165
+ df_analysis['Cost per Token'] = df_analysis['Cost (USD)'] / df_analysis['Total Token Usage']
166
+ df_analysis['Tokens per Second'] = df_analysis['Total Token Usage'] / df_analysis['Duration (s)']
167
+ df_analysis['Cost per Second'] = df_analysis['Cost (USD)'] / df_analysis['Duration (s)']
168
+
169
+ efficiency_metrics = ['Cost per Token', 'Tokens per Second', 'Cost per Second']
170
+
171
+ for metric in efficiency_metrics:
172
+ print(f"\n{metric.upper()}:")
173
+ eff_stats = df_analysis.groupby('MODEL')[metric].agg([
174
+ 'count', 'mean', 'std', 'min', 'median', 'max'
175
+ ])
176
+
177
+ for col in ['mean', 'std', 'min', 'median', 'max']:
178
+ eff_stats[col] = eff_stats[col].apply(lambda x: f"{x:.3g}")
179
+ print(eff_stats)
180
+
181
+
147
182
```
148
183
149
184
### Contributing
0 commit comments