"""
Alignment models are given a sequence of events along with a piece of audio, and then
return a sequence of timestamps, one for each event, indicating the position of that
event in the audio. The events are listed in order of occurrence in the audio, so the
output timestamps must be monotonically increasing.

Evaluation typically involves comparing the predicted and the ground truth timestamps
pair-wise, e.g. by taking the median absolute error in seconds.

Conventions
-----------
Timestamps should be provided in the form of a 1-dimensional array of onset
times in seconds in increasing order.
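
For example, a valid timestamp array for four events could look like this
(synthetic values, shown purely for illustration):

>>> import numpy as np
>>> timestamps = np.array([0.42, 1.52, 2.97, 4.03])  # one onset per event, increasing
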
Metrics
-------
* :func:`mir_eval.alignment.absolute_error`: Median absolute error and average
  absolute error
* :func:`mir_eval.alignment.percentage_correct`: Percentage of correct timestamps,
  where a timestamp is counted as correct if it lies within a certain tolerance
  window around the ground truth timestamp
* :func:`mir_eval.alignment.percentage_correct_segments`: Percentage of correct
  segments: percentage of overlap between predicted segments and ground truth
  segments, where segments are defined by (start time, end time) pairs
* :func:`mir_eval.alignment.karaoke_perceptual_metric`: Metric based on human
  synchronicity perception as measured in the paper "User-centered evaluation of
  lyrics to audio alignment" [#lizemasclef2021]_

References
----------
.. [#lizemasclef2021] N. Lizé-Masclef, A. Vaglio, M. Moussallam.
"User-centered evaluation of lyrics to audio alignment",
International Society for Music Information Retrieval (ISMIR) conference,
2021.
.. [#mauch2010] M. Mauch, H. Fujihara, M. Goto.
    "Lyrics-to-audio alignment and phrase-level segmentation using
    incomplete internet-style chord annotations",
    Proceedings of the Sound and Music Computing Conference (SMC), 2010.
.. [#dzhambazov2017] G. Dzhambazov.
"Knowledge-Based Probabilistic Modeling For Tracking Lyrics In Music Audio Signals",
PhD Thesis, 2017.
.. [#fujihara2011] H. Fujihara, M. Goto, J. Ogata, H. Okuno.
"LyricSynchronizer: Automatic synchronization system between musical audio signals and lyrics",
    IEEE Journal of Selected Topics in Signal Processing, vol. 5, no. 6, 2011.
"""

import collections
from typing import Optional

import numpy as np
from scipy.stats import skewnorm

from mir_eval.util import filter_kwargs


def validate(reference_timestamps: np.ndarray, estimated_timestamps: np.ndarray):
    """Check that the input annotations to a metric look like valid onset time
    arrays, and raise helpful errors if not.

Parameters
----------
reference_timestamps : np.ndarray
reference timestamp locations, in seconds
estimated_timestamps : np.ndarray
estimated timestamp locations, in seconds
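
    Examples
    --------
    A minimal sketch of how invalid input is rejected (synthetic values):

    >>> import numpy as np
    >>> mir_eval.alignment.validate(np.array([1.0, 2.0]), np.array([2.0, 1.0]))
    Traceback (most recent call last):
        ...
    ValueError: Estimated timestamps are not monotonically increasing!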
"""
# We need to have 1D numpy arrays
if not isinstance(reference_timestamps, np.ndarray):
raise ValueError(
"Reference timestamps need to be a numpy array, but got"
f" {type(reference_timestamps)}"
)
if not isinstance(estimated_timestamps, np.ndarray):
raise ValueError(
"Estimated timestamps need to be a numpy array, but got"
f" {type(estimated_timestamps)}"
)
if reference_timestamps.ndim != 1:
raise ValueError(
"Reference timestamps need to be a one-dimensional vector, but got"
f" {reference_timestamps.ndim} dimensions"
)
if estimated_timestamps.ndim != 1:
raise ValueError(
"Estimated timestamps need to be a one-dimensional vector, but got"
f" {estimated_timestamps.ndim} dimensions"
)
# If reference or estimated timestamps are empty, cannot compute metric
if reference_timestamps.size == 0:
raise ValueError("Reference timestamps are empty.")
if estimated_timestamps.size != reference_timestamps.size:
raise ValueError(
"Number of timestamps must be the same in prediction and ground"
f" truth, but found {estimated_timestamps.size} in prediction and"
f" {reference_timestamps.size} in ground truth"
)
# Check monotonicity
if not np.all(reference_timestamps[1:] - reference_timestamps[:-1] >= 0):
raise ValueError("Reference timestamps are not monotonically increasing!")
if not np.all(estimated_timestamps[1:] - estimated_timestamps[:-1] >= 0):
raise ValueError("Estimated timestamps are not monotonically increasing!")
    # Check positivity (needed for correct PCS metric calculation)
    if not np.all(reference_timestamps >= 0):
        raise ValueError("Reference timestamps cannot be below 0!")
    if not np.all(estimated_timestamps >= 0):
        raise ValueError("Estimated timestamps cannot be below 0!")


def absolute_error(reference_timestamps, estimated_timestamps):
    """Compute the absolute deviations between estimated and reference timestamps,
    and then return the median and average over all events.

    Examples
    --------
    >>> reference_timestamps = mir_eval.io.load_events('reference.txt')
    >>> estimated_timestamps = mir_eval.io.load_events('estimated.txt')
    >>> mae, aae = mir_eval.alignment.absolute_error(reference_timestamps,
    ...                                              estimated_timestamps)
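
    A minimal synthetic sketch with hypothetical values:

    >>> import numpy as np
    >>> ref = np.array([1.0, 2.0, 4.0])
    >>> est = np.array([1.1, 2.1, 4.4])
    >>> mae, aae = mir_eval.alignment.absolute_error(ref, est)
    >>> print(round(float(mae), 2), round(float(aae), 2))
    0.1 0.2
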
Parameters
----------
reference_timestamps : np.ndarray
reference timestamps, in seconds
estimated_timestamps : np.ndarray
estimated timestamps, in seconds
Returns
-------
mae : float
Median absolute error
aae: float
Average absolute error
"""
validate(reference_timestamps, estimated_timestamps)
deviations = np.abs(reference_timestamps - estimated_timestamps)
return np.median(deviations), np.mean(deviations)


def percentage_correct(reference_timestamps, estimated_timestamps, window=0.3):
    """Compute the percentage of correctly predicted timestamps. A timestamp is
    predicted correctly if its position does not deviate from the ground truth
    timestamp by more than the window parameter.

    Examples
    --------
    >>> reference_timestamps = mir_eval.io.load_events('reference.txt')
    >>> estimated_timestamps = mir_eval.io.load_events('estimated.txt')
    >>> pc = mir_eval.alignment.percentage_correct(reference_timestamps,
    ...                                            estimated_timestamps, window=0.2)
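
    A minimal synthetic sketch with hypothetical values:

    >>> import numpy as np
    >>> ref = np.array([1.0, 2.0, 3.0])
    >>> est = np.array([1.1, 2.5, 3.05])
    >>> pc = mir_eval.alignment.percentage_correct(ref, est, window=0.3)
    >>> print(round(float(pc), 3))
    0.667
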
Parameters
----------
reference_timestamps : np.ndarray
reference timestamps, in seconds
estimated_timestamps : np.ndarray
estimated timestamps, in seconds
window : float
Window size, in seconds
(Default value = .3)
Returns
-------
pc : float
Percentage of correct timestamps
"""
validate(reference_timestamps, estimated_timestamps)
deviations = np.abs(reference_timestamps - estimated_timestamps)
return np.mean(deviations <= window)


def percentage_correct_segments(
    reference_timestamps, estimated_timestamps, duration: Optional[float] = None
):
    """Calculate the percentage of correct segments (PCS) metric.

    Segments are constructed separately from the reference and the estimated
    timestamp vectors, and the metric is the percentage of overlap between
    corresponding reference and estimated segments, relative to the total duration.

    WARNING: This metric behaves differently depending on whether "duration" is
    given!

    If duration is not given (default case), the computation follows the MIREX
    lyrics alignment challenge 2020. For a timestamp vector with entries
    (t1, t2, ... tN), segments with the following (start, end) boundaries are
    created: (t1, t2), ... (tN-1, tN). After the segments are created, the overlap
    between the reference and estimated segments is determined and divided by the
    total duration, which is the distance between the first and last timestamp in
    the reference.

    If duration is given, the segment boundaries are instead
    (0, t1), (t1, t2), ... (tN, duration). The overlap is computed in the same way,
    but then divided by the duration parameter given to this function. This variant
    more closely follows the original paper [#fujihara2011]_, in which the metric
    was proposed. As a result, it penalizes cases where the first estimated
    timestamp is too early or the last estimated timestamp is too late, whereas the
    MIREX variant does not. On the other hand, the MIREX variant is invariant to
    the length of the eventless beginning and end of the audio, which might be a
    desirable property.

Examples
--------
>>> reference_timestamps = mir_eval.io.load_events('reference.txt')
>>> estimated_timestamps = mir_eval.io.load_events('estimated.txt')
    >>> pcs = mir_eval.alignment.percentage_correct_segments(reference_timestamps,
    ...                                                      estimated_timestamps)
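
    The two variants can be compared on a small synthetic example
    (hypothetical values):

    >>> import numpy as np
    >>> ref = np.array([1.0, 3.0, 5.0])
    >>> est = np.array([1.2, 3.1, 4.8])
    >>> pcs_mirex = mir_eval.alignment.percentage_correct_segments(ref, est)
    >>> print(round(float(pcs_mirex), 3))
    0.875
    >>> pcs_full = mir_eval.alignment.percentage_correct_segments(
    ...     ref, est, duration=6.0)
    >>> print(round(float(pcs_full), 3))
    0.917
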
Parameters
----------
reference_timestamps : np.ndarray
reference timestamps, in seconds
estimated_timestamps : np.ndarray
estimated timestamps, in seconds
duration : float
Optional. Total duration of audio (seconds). WARNING: Metric is computed differently
depending on whether this is provided or not - see documentation above!
Returns
-------
pcs : float
Percentage of time where ground truth and predicted segments overlap
"""
validate(reference_timestamps, estimated_timestamps)
if duration is not None:
duration = float(duration)
if duration <= 0:
raise ValueError(
f"Positive duration needs to be provided, but got {duration}"
)
        if np.max(reference_timestamps) > duration:
            raise ValueError(
                "Expected largest reference timestamp "
                f"{np.max(reference_timestamps)} to not be "
                f"larger than duration {duration}"
            )
if np.max(estimated_timestamps) > duration:
raise ValueError(
"Expected largest estimated timestamp "
f"{np.max(estimated_timestamps)} to not be "
f"larger than duration {duration}"
)
ref_starts = np.concatenate([[0], reference_timestamps])
ref_ends = np.concatenate([reference_timestamps, [duration]])
est_starts = np.concatenate([[0], estimated_timestamps])
est_ends = np.concatenate([estimated_timestamps, [duration]])
else:
# MIREX lyrics alignment 2020 style:
# Ignore regions before start and after end reference timestamp
duration = reference_timestamps[-1] - reference_timestamps[0]
        if duration <= 0:
            raise ValueError(
                "Reference timestamps are all identical, cannot compute PCS metric!"
            )
ref_starts = reference_timestamps[:-1]
ref_ends = reference_timestamps[1:]
est_starts = estimated_timestamps[:-1]
est_ends = estimated_timestamps[1:]
overlap_starts = np.maximum(ref_starts, est_starts)
overlap_ends = np.minimum(ref_ends, est_ends)
overlap_duration = np.sum(np.maximum(overlap_ends - overlap_starts, 0))
return overlap_duration / duration


def karaoke_perceptual_metric(reference_timestamps, estimated_timestamps):
    """Metric based on human synchronicity perception as measured in the paper
    "User-centered evaluation of lyrics to audio alignment" [#lizemasclef2021]_.

    The parameters of this function were tuned on data collected through a
    Karaoke-like user experiment.
    It reflects human judgment of how "synchronous" lyrics and audio stimuli are
    perceived in that setup.
    Beware that this metric is non-symmetric, and by construction it does not
    equal 1 even at an offset of 0.

    Examples
    --------
    >>> reference_timestamps = mir_eval.io.load_events('reference.txt')
    >>> estimated_timestamps = mir_eval.io.load_events('estimated.txt')
    >>> score = mir_eval.alignment.karaoke_perceptual_metric(reference_timestamps,
    ...                                                      estimated_timestamps)
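
    A minimal synthetic sketch with hypothetical values; note that even a
    perfectly aligned estimate does not score exactly 1:

    >>> import numpy as np
    >>> ref = np.array([1.0, 2.0, 3.0])
    >>> score = mir_eval.alignment.karaoke_perceptual_metric(ref, ref)
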
Parameters
----------
reference_timestamps : np.ndarray
reference timestamps, in seconds
estimated_timestamps : np.ndarray
estimated timestamps, in seconds
Returns
-------
perceptual_score : float
Perceptual score, averaged over all timestamps
"""
validate(reference_timestamps, estimated_timestamps)
offsets = estimated_timestamps - reference_timestamps
    # Score offsets using a skewed normal distribution whose parameters were
    # tuned on the perceptual experiment data
skewness = 1.12244251
localisation = -0.22270315
scale = 0.29779424
normalisation_factor = 1.6857
perceptual_scores = (1.0 / normalisation_factor) * skewnorm.pdf(
offsets, skewness, loc=localisation, scale=scale
)
return np.mean(perceptual_scores)


def evaluate(reference_timestamps, estimated_timestamps, **kwargs):
    """Compute all metrics for the given reference and estimated annotations.

    Examples
    --------
    >>> reference_timestamps = mir_eval.io.load_events('reference.txt')
    >>> estimated_timestamps = mir_eval.io.load_events('estimated.txt')
    >>> duration = max(np.max(reference_timestamps), np.max(estimated_timestamps)) + 10
    >>> scores = mir_eval.alignment.evaluate(reference_timestamps,
    ...                                      estimated_timestamps, duration=duration)
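
    Alternatively, a minimal synthetic sketch with hypothetical values; the extra
    keyword argument is forwarded to :func:`percentage_correct`:

    >>> import numpy as np
    >>> ref = np.array([0.5, 1.5, 2.5])
    >>> est = np.array([0.6, 1.4, 2.8])
    >>> scores = mir_eval.alignment.evaluate(ref, est, window=0.25)
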
Parameters
----------
reference_timestamps : np.ndarray
reference timestamp locations, in seconds
estimated_timestamps : np.ndarray
estimated timestamp locations, in seconds
**kwargs
Additional keyword arguments which will be passed to the
appropriate metric or preprocessing functions.
Returns
-------
scores : dict
Dictionary of scores, where the key is the metric name (str) and
the value is the (float) score achieved.
"""
# Compute all metrics
scores = collections.OrderedDict()
scores["pc"] = filter_kwargs(
percentage_correct, reference_timestamps, estimated_timestamps, **kwargs
)
scores["mae"], scores["aae"] = absolute_error(
reference_timestamps, estimated_timestamps
)
scores["pcs"] = filter_kwargs(
percentage_correct_segments,
reference_timestamps,
estimated_timestamps,
**kwargs,
)
scores["perceptual"] = karaoke_perceptual_metric(
reference_timestamps, estimated_timestamps
)
return scores