Modules

Arcos4py top level module.

A Python package for the detection and tracking of collective events in time-series data and raster images.

ARCOS(data, position_columns=['x'], frame_column='time', obj_id_column='id', measurement_column='meas', clid_column='clTrackID', n_jobs=1, **kwargs)

Detects and tracks collective events in a tracked time-series dataset.

Requires a binarized measurement column, which can be generated with the bin_measurements method. Tracking uses the DBSCAN algorithm, which is applied to every frame; collective events are then connected across frames when they lie within eps distance of each other.
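
The per-frame clustering and frame-to-frame linking can be pictured with the following minimal sketch. It is illustrative only, not the arcos4py implementation: it uses scikit-learn's DBSCAN plus a KD-tree for nearest-neighbour linking, assumes every frame contains at least one active point, and ignores n_prev, merges, and splits.

import numpy as np
from scipy.spatial import cKDTree
from sklearn.cluster import DBSCAN

def track_frames(frames, eps=1.0, min_samples=1):
    """Cluster each frame, then inherit the nearest previous event id within eps."""
    out, next_id = [], 1
    prev_pts = prev_ids = None
    for pts in frames:  # pts: (n_points, n_dims) array of active positions in one frame
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(pts)
        ids = np.full(len(pts), -1, dtype=int)
        tree = cKDTree(prev_pts) if prev_pts is not None and len(prev_pts) else None
        for lbl in np.unique(labels):
            if lbl == -1:  # DBSCAN noise stays unassigned
                continue
            mask = labels == lbl
            if tree is not None:
                dist, idx = tree.query(pts[mask])
                near = dist <= eps
                if near.any():  # inherit the most common previous event id
                    ids[mask] = np.bincount(prev_ids[idx[near]]).argmax()
                    continue
            ids[mask] = next_id  # otherwise start a new collective event
            next_id += 1
        out.append(ids)
        keep = ids >= 0  # only clustered points can seed links into the next frame
        prev_pts, prev_ids = pts[keep], ids[keep]
    return out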

Attributes:

Name Type Description
data DataFrame

Data of tracked time-series in "long format". Can be used to access the modified dataframe at any point.

position_columns list

List of position column name strings inside data. At least one dimension is required.

frame_column str

Indicating the frame column in input_data.

obj_id_column str

Indicating the track id/object id column in input_data.

measurement_column str

Indicating the measurement column in input_data.

clid_column str

Indicating the column name containing the collective event ids.

binarized_measurement_column str | None

Name of the binary column. This is generated from the name of the measurement_column after binarization. Can optionally be set in order to provide an already binarized column and skip ARCOS binarization.

Parameters:

Name Type Description Default
data DataFrame

Input Data of tracked time-series in "long format" containing position columns, a measurement and an object ID column.

required
position_columns list

List of position column name strings inside data. At least one dimension is required.

['x']
frame_column str

Indicating the frame column in input_data.

'time'
obj_id_column str

Indicating the track id/object id column in input_data. If None, the data is assumed to not have a tracking column, and binarization can only be performed without detrending.

'id'
measurement_column str

Indicating the measurement column in input_data.

'meas'
clid_column str

Indicating the column name containing the collective event ids.

'clTrackID'
n_jobs int

Number of workers to spawn; -1 uses all available CPUs.

1
kwargs Any

Additional keyword arguments. Includes old parameter names for backwards compatibility.
- posCols: Deprecated alias for position_columns.
- id_column: Deprecated alias for obj_id_column.

{}
Source code in arcos4py/_arcos4py.py
def __init__(
    self,
    data: pd.DataFrame,
    position_columns: list = ["x"],
    frame_column: str = 'time',
    obj_id_column: str | None = 'id',
    measurement_column: str = 'meas',
    clid_column: str = 'clTrackID',
    n_jobs: int = 1,
    **kwargs,
) -> None:
    """Constructs class with provided arguments.

    Arguments:
        data (DataFrame): Input Data of tracked time-series in "long format" containing position columns,
            a measurement and an object ID column.
        position_columns (list): List of position column name strings inside data.
            At least one dimension is required.
        frame_column (str): Indicating the frame column in input_data.
        obj_id_column (str): Indicating the track id/object id column in input_data. If None, the data is assumed to
            not have a tracking column. Binarization can only be performed without detrending.
        measurement_column (str): Indicating the measurement column in input_data.
        clid_column (str): Indicating the column name containing the collective event ids.
        n_jobs (int): Number of workers to spawn; -1 uses all available CPUs.
        kwargs (Any): Additional keyword arguments. Includes old parameter names for backwards compatibility.
            - posCols: Deprecated alias for position_columns.
            - id_column: Deprecated alias for obj_id_column.
    """
    # allowed kwargs
    allowed_kwargs = ["posCols", "id_column"]
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"__init__() got an unexpected keyword argument '{key}'")
    # Handle deprecated parameters
    param_mapping = {
        "posCols": "position_columns",
        "id_column": "obj_id_column",
    }
    updated_kwargs = handle_deprecated_params(param_mapping, **kwargs)

    # Assign updated kwargs to class attributes
    position_columns = updated_kwargs.get("position_columns", position_columns)
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)

    self.data = data
    self.position_columns = position_columns
    self.frame_column = frame_column
    self.obj_id_column = obj_id_column
    self.measurement_column = measurement_column
    self.clid_column = clid_column
    self.n_jobs = n_jobs

    # assign None so it can later be checked whether binarization was run
    self.binarized_measurement_column: Union[str, None] = None
    # sort by frame, and additionally by object id if a tracking column is present
    if self.obj_id_column is None:
        self.data = self.data.sort_values(by=[self.frame_column])
    else:
        self.data = self.data.sort_values(by=[self.frame_column, self.obj_id_column])
    self._check_col()
    if self.measurement_column is not None:
        self.resc_col = f"{self.measurement_column}.resc"
        self.binarized_measurement_column = f"{self.measurement_column}.bin"
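
A minimal usage sketch (column names and values are illustrative; assumes ARCOS is importable from the package top level, as documented on this page):

import pandas as pd
from arcos4py import ARCOS

df = pd.DataFrame(
    {
        "time": [0, 0, 1, 1, 2, 2],
        "x": [0.0, 5.0, 0.1, 5.1, 0.2, 5.2],
        "id": [1, 2, 1, 2, 1, 2],
        "meas": [0.1, 0.9, 0.2, 0.8, 0.1, 0.9],
    }
)
ts = ARCOS(
    df,
    position_columns=["x"],
    frame_column="time",
    obj_id_column="id",
    measurement_column="meas",
)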

bin_col: str | None property writable

Return the name of the binarized measurement column.

id_column: str | None property writable

Return the name of the id column.

posCols: list property writable

Return the position columns.

bin_measurements(smooth_k=3, bias_k=51, peak_threshold=0.2, binarization_threshold=0.1, polynomial_degree=1, bias_method='runmed', **kwargs)

Smooth, de-trend, and binarise the input data.

First, a short-term median filter of size smooth_k is applied to remove fast noise from the time series. If the de-trending method is set to "none", smoothing is applied to the globally rescaled time series. The subsequent de-trending can be performed with a long-term median filter of size bias_k (bias_method = "runmed") or by fitting a polynomial of degree polynomial_degree (bias_method = "lm").

After de-trending, if the global difference between min and max is greater than peak_threshold, the signal is rescaled to the (0, 1) range. The final signal is binarised using binarization_threshold.
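
As a rough illustration of these steps on a single series, here is a simplified sketch using the 'runmed' de-trending path. It is not the arcos4py implementation, which additionally handles per-track grouping, the 'lm' and 'none' paths, and parallelisation:

import numpy as np
from scipy.ndimage import median_filter

def binarize(series, smooth_k=3, bias_k=51, peak_threshold=0.2, binarization_threshold=0.1):
    smoothed = median_filter(series, size=smooth_k, mode="nearest")  # short-term smoothing
    trend = median_filter(smoothed, size=bias_k, mode="nearest")  # long-term trend (bias_method="runmed")
    detrended = np.clip(smoothed - trend, 0, None)
    span = detrended.max() - detrended.min()
    if span > peak_threshold:  # rescale only if the signal contains a real peak
        detrended = (detrended - detrended.min()) / span
    return (detrended > binarization_threshold).astype(int)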

Parameters:

Name Type Description Default
smooth_k int

Size of the short-term median smoothing filter.

3
bias_k int

Size of the long-term de-trending median filter.

51
peak_threshold float

Threshold for rescaling of the de-trended signal.

0.2
binarization_threshold float

Threshold for binary classification.

0.1
polynomial_degree int

Sets the degree of the polynomial for lm fitting.

1
bias_method str

De-trending method, one of ['runmed', 'lm', 'none']. If no id_column is provided, only 'none' is allowed.

'runmed'
**kwargs Any

Additional keyword arguments. Includes old parameter names for backwards compatibility.
- smoothK: Size of the short-term median smoothing filter.
- biasK: Size of the long-term de-trending median filter.
- peakThr: Threshold for rescaling of the de-trended signal.
- binThr: Threshold for binary classification.
- polyDeg: Sets the degree of the polynomial for lm fitting.
- biasMet: De-trending method, one of ['runmed', 'lm', 'none'].

{}

Returns:

Type Description
DataFrame

DataFrame with detrended/smoothed and binarized measurement column.

Source code in arcos4py/_arcos4py.py
def bin_measurements(
    self,
    smooth_k: int = 3,
    bias_k: int = 51,
    peak_threshold: float = 0.2,
    binarization_threshold: float = 0.1,
    polynomial_degree: int = 1,
    bias_method: str = "runmed",
    **kwargs,
) -> pd.DataFrame:
    r"""Smooth, de-trend, and binarise the input data.

    First a short-term median filter with size smoothK
    is applied to remove fast noise from the time series.
    If the de-trending method is set to "none",
    smoothing is applied on globally rescaled time series.
    The subsequent de-trending can be performed with a long-term median filter
    with the size biasK {biasMet = "runmed"}
    or by fitting a polynomial of degree polyDeg {biasMet = "lm"}.

    After de-trending,
    if the global difference between min/max is greater than the threshold
    the signal is rescaled to the (0,1) range.
    The final signal is binarised using the binThr threshold

    Arguments:
        smooth_k (int): Size of the short-term median smoothing filter.
        bias_k (int): Size of the long-term de-trending median filter.
        peak_threshold (float): Threshold for rescaling of the de-trended signal.
        binarization_threshold (float): Threshold for binary classification.
        polynomial_degree (int): Sets the degree of the polynomial for lm fitting.
        bias_method (str): De-trending method, one of ['runmed', 'lm', 'none'].
            If no id_column is provided, only 'none' is allowed.
        **kwargs (Any): Additional keyword arguments. Includes old parameter names for backwards compatibility.
            - smoothK: Size of the short-term median smoothing filter.
            - biasK: Size of the long-term de-trending median filter.
            - peakThr: Threshold for rescaling of the de-trended signal.
            - binThr: Threshold for binary classification.
            - polyDeg: Sets the degree of the polynomial for lm fitting.
            - biasMet: De-trending method, one of ['runmed', 'lm', 'none'].

    Returns:
        DataFrame with detrended/smoothed and binarized measurement column.
    """
    # allowed kwargs
    param_mapping = {
        "smoothK": "smooth_k",
        "biasK": "bias_k",
        "peakThr": "peak_threshold",
        "binThr": "binarization_threshold",
        "polyDeg": "polynomial_degree",
        "biasMet": "bias_method",
    }
    # allowed kwargs
    allowed_kwargs = param_mapping.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"bin_measurements() got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(param_mapping, **kwargs)

    smooth_k = updated_kwargs.get("smooth_k", smooth_k)
    bias_k = updated_kwargs.get("bias_k", bias_k)
    peak_threshold = updated_kwargs.get("peak_threshold", peak_threshold)
    binarization_threshold = updated_kwargs.get("binarization_threshold", binarization_threshold)
    polynomial_degree = updated_kwargs.get("polynomial_degree", polynomial_degree)
    bias_method = updated_kwargs.get("bias_method", bias_method)

    self.data = binData(
        smooth_k,
        bias_k,
        peak_threshold,
        binarization_threshold,
        polynomial_degree,
        bias_method,
        n_jobs=self.n_jobs,
    ).run(
        self.data,
        measurement_column=self.measurement_column,
        group_column=self.obj_id_column,
        frame_column=self.frame_column,
    )
    return self.data
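
Continuing the usage sketch from above. With only a few frames per track a long de-trending window is meaningless, so de-trending is disabled here; parameter values are illustrative:

binarized = ts.bin_measurements(smooth_k=1, bias_method="none", binarization_threshold=0.5)
# per the names set in __init__, the rescaled column is 'meas.resc' and the binary one 'meas.bin'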

clip_meas(clip_low=0.001, clip_high=0.999)

Clip measurement column to upper and lower quantiles defined in clip_low and clip_high.

Parameters:

Name Type Description Default
clip_low float

Lower clipping boundary (quantile).

0.001
clip_high float

Upper clipping boundary (quantile).

0.999

Returns:

Type Description
DataFrame

DataFrame with the measurement column clipped in place.

Source code in arcos4py/_arcos4py.py
def clip_meas(self, clip_low: float = 0.001, clip_high: float = 0.999) -> pd.DataFrame:
    """Clip measurement column to upper and lower quantiles defined in clip_low and clip_high.

    Arguments:
        clip_low (float): Lower clipping boundary (quantile).

        clip_high (float): Upper clipping boundary (quantile).

    Returns:
        DataFrame with the measurement column clipped in place.
    """
    # Issue a deprecation warning
    warnings.warn(
        "The 'clip_meas' method is deprecated and will be removed in a future version.\
        Please use 'clip_measurements' instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return self.clip_measurements(clip_low, clip_high)

clip_measurements(clip_low=0.001, clip_high=0.999)

Clip measurement column to upper and lower quantiles defined in clip_low and clip_high.

Parameters:

Name Type Description Default
clip_low float

Lower clipping boundary (quantile).

0.001
clip_high float

Upper clipping boundary (quantile).

0.999

Returns:

Type Description
DataFrame

DataFrame with the measurement column clipped in place.

Source code in arcos4py/_arcos4py.py
def clip_measurements(self, clip_low: float = 0.001, clip_high: float = 0.999) -> pd.DataFrame:
    """Clip measurement column to upper and lower quantiles defined in clip_low and clip_high.

    Arguments:
        clip_low (float): Lower clipping boundary (quantile).

        clip_high (float): Upper clipping boundary (quantile).

    Returns:
        DataFrame with the measurement column clipped in place.
    """
    meas_column = self.data[self.measurement_column].to_numpy()
    meas_clipped = clipMeas(meas_column).clip(clip_low, clip_high)
    self.data[self.measurement_column] = meas_clipped
    return self.data
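
For example, to clip outliers to the default 0.1% and 99.9% quantiles before binarization:

ts.clip_measurements(clip_low=0.001, clip_high=0.999)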

interpolate_measurements()

Interpolates NaNs in place in the measurement column.

Returns:

Type Description
DataFrame

Dataframe with interpolated measurement column.

Source code in arcos4py/_arcos4py.py
def interpolate_measurements(self) -> pd.DataFrame:
    """Interpolates NaN's in place in measurement column.

    Returns:
        Dataframe with interpolated measurement column.
    """
    meas_interp = interpolation(self.data).interpolate()
    self.data = meas_interp
    return self.data
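
For example, to fill measurement gaps before clipping and binarization:

ts.interpolate_measurements()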

trackCollev(eps=1, eps_prev=None, min_clustersize=1, n_prev=1, clustering_method='dbscan', linking_method='nearest', min_samples=None, **kwargs)

Detects and tracks collective events in a tracked time-series dataset. Deprecated: use track_collective_events instead.

Applies the DBSCAN algorithm to every time frame and subsequently connects collective events between frames located within eps distance of each other.

Parameters:

Name Type Description Default
eps float

The maximum distance between two samples for one to be considered as in the neighbourhood of the other. This is not a maximum bound on the distances of points within a cluster.

1
eps_prev float | None

Frame-to-frame distance; the value is used to connect collective events across multiple frames. If None, the same value as eps is used.

None
min_clustersize int

The minimum size for a cluster to be identified as a collective event.

1
n_prev int

Number of previous frames the tracking algorithm looks back to connect collective events.

1
clustering_method str

Clustering method, one of ['dbscan', 'hdbscan'].

'dbscan'
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.

None
linking_method str

Linking method, one of ['nearest', 'transportation'].

'nearest'
**kwargs Any

Additional keyword arguments. Includes old parameter names for backwards compatibility.
- epsPrev: Frame-to-frame distance, used to connect collective events across multiple frames.
- minClsz: The minimum size for a cluster to be identified as a collective event.
- nPrev: Number of previous frames the tracking algorithm looks back to connect collective events.
- clusteringMethod: Clustering method, one of ['dbscan', 'hdbscan'].
- minSamples: The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
- linkingMethod: Linking method, one of ['nearest', 'transportation'].

{}

Returns:

Type Description
DataFrame

DataFrame with detected collective events across time.

Source code in arcos4py/_arcos4py.py
def trackCollev(
    self,
    eps: float = 1,
    eps_prev: Union[float, None] = None,
    min_clustersize: int = 1,
    n_prev: int = 1,
    clustering_method: str = "dbscan",
    linking_method: str = "nearest",
    min_samples: int | None = None,
    **kwargs,
) -> pd.DataFrame:
    """Detects and tracks collective events in a tracked time-series dataset.

    Applies the DBSCAN algorithm to every time frame and subsequently connects
    collective events between frames located within eps distance of each other.

    Arguments:
        eps (float): The maximum distance between two samples for one to be considered as in
            the neighbourhood of the other.
            This is not a maximum bound on the distances of points within a cluster.
        eps_prev (float | None): Frame-to-frame distance; the value is used to connect
            collective events across multiple frames. If None, the same value as eps is used.
        min_clustersize (int): The minimum size for a cluster to be identified as a collective event
        n_prev (int): Number of previous frames the tracking
            algorithm looks back to connect collective events
        clustering_method (str): Clustering method, one of ['dbscan', 'hdbscan'].
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
        linking_method (str): Linking method, one of ['nearest', 'transportation'].
        **kwargs (Any): Additional keyword arguments. Includes old parameter names for backwards compatibility.
            - epsPrev: Frame to frame distance, value is used to connect
                collective events across multiple frames.
            - minClsz: The minimum size for a cluster to be identified as a collective event
            - nPrev: Number of previous frames the tracking
                algorithm looks back to connect collective events
            - clusteringMethod: Clustering method, one of ['dbscan', 'hdbscan'].
            - minSamples: The number of samples (or total weight) in a neighbourhood for a
                point to be considered as a core point. This includes the point itself.
                Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
            - linkingMethod: Linking method, one of ['nearest', 'transportation'].

    Returns:
        DataFrame with detected collective events across time.
    """
    warnings.warn(
        "The 'trackCollev' method is deprecated and will be removed in a future version.\
            Please use 'track_collective_events' instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return self.track_collective_events(
        eps, eps_prev, min_clustersize, n_prev, clustering_method, linking_method, min_samples, **kwargs
    )

track_collective_events(eps=1, eps_prev=None, min_clustersize=1, n_prev=1, clustering_method='dbscan', linking_method='nearest', min_samples=None, **kwargs)

Detects and tracks collective events in a tracked time-series dataset.

Applies the DBSCAN algorithm to every time frame and subsequently connects collective events between frames located within eps distance of each other.

Parameters:

Name Type Description Default
eps float

The maximum distance between two samples for one to be considered as in the neighbourhood of the other. This is not a maximum bound on the distances of points within a cluster.

1
eps_prev float | None

Frame-to-frame distance; the value is used to connect collective events across multiple frames. If None, the same value as eps is used.

None
min_clustersize int

The minimum size for a cluster to be identified as a collective event.

1
n_prev int

Number of previous frames the tracking algorithm looks back to connect collective events.

1
clustering_method str

Clustering method, one of ['dbscan', 'hdbscan'].

'dbscan'
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.

None
linking_method str

Linking method, one of ['nearest', 'transportation'].

'nearest'
**kwargs Any

Additional keyword arguments. Includes old parameter names for backwards compatibility.
- epsPrev: Frame-to-frame distance, used to connect collective events across multiple frames.
- minClsz: The minimum size for a cluster to be identified as a collective event.
- nPrev: Number of previous frames the tracking algorithm looks back to connect collective events.
- clusteringMethod: Clustering method, one of ['dbscan', 'hdbscan'].
- minSamples: The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
- linkingMethod: Linking method, one of ['nearest', 'transportation'].

{}

Returns:

Type Description
DataFrame

DataFrame with detected collective events across time.

Source code in arcos4py/_arcos4py.py
def track_collective_events(
    self,
    eps: float = 1,
    eps_prev: Union[float, None] = None,
    min_clustersize: int = 1,
    n_prev: int = 1,
    clustering_method: str = "dbscan",
    linking_method: str = "nearest",
    min_samples: int | None = None,
    **kwargs,
) -> pd.DataFrame:
    """Detects and tracks collective events in a tracked time-series dataset.

    Applies the DBSCAN algorithm to every time frame and subsequently connects
    collective events between frames located within eps distance of each other.

    Arguments:
        eps (float): The maximum distance between two samples for one to be considered as in
            the neighbourhood of the other.
            This is not a maximum bound on the distances of points within a cluster.
        eps_prev (float | None): Frame-to-frame distance; the value is used to connect
            collective events across multiple frames. If None, the same value as eps is used.
        min_clustersize (int): The minimum size for a cluster to be identified as a collective event
        n_prev (int): Number of previous frames the tracking
            algorithm looks back to connect collective events
        clustering_method (str): Clustering method, one of ['dbscan', 'hdbscan'].
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
        linking_method (str): Linking method, one of ['nearest', 'transportation'].
        **kwargs (Any): Additional keyword arguments. Includes old parameter names for backwards compatibility.
            - epsPrev: Frame to frame distance, value is used to connect
                collective events across multiple frames.
            - minClsz: The minimum size for a cluster to be identified as a collective event
            - nPrev: Number of previous frames the tracking
                algorithm looks back to connect collective events
            - clusteringMethod: Clustering method, one of ['dbscan', 'hdbscan'].
            - minSamples: The number of samples (or total weight) in a neighbourhood for a
                point to be considered as a core point. This includes the point itself.
                Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
            - linkingMethod: Linking method, one of ['nearest', 'transportation'].

    Returns:
        DataFrame with detected collective events across time.
    """
    param_mapping = {
        "epsPrev": "eps_prev",
        "minClsz": "min_clustersize",
        "nPrev": "n_prev",
        "clusteringMethod": "clustering_method",
        "minSamples": "min_samples",
        "linkingMethod": "linking_method",
    }
    # allowed kwargs
    allowed_kwargs = param_mapping.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"track_collective_events() got an unexpected keyword argument '{key}'")
    updated_kwargs = handle_deprecated_params(param_mapping, **kwargs)

    eps_prev = updated_kwargs.get("eps_prev", eps_prev)
    min_clustersize = updated_kwargs.get("min_clustersize", min_clustersize)
    n_prev = updated_kwargs.get("n_prev", n_prev)
    clustering_method = updated_kwargs.get("clustering_method", clustering_method)
    min_samples = updated_kwargs.get("min_samples", min_samples)
    linking_method = updated_kwargs.get("linking_method", linking_method)

    data_events = track_events_dataframe(
        X=self.data,
        position_columns=self.position_columns,
        frame_column=self.frame_column,
        id_column=self.obj_id_column,
        binarized_measurement_column=self.binarized_measurement_column,
        eps=eps,
        eps_prev=eps_prev,
        min_clustersize=min_clustersize,
        n_prev=n_prev,
        clid_column=self.clid_column,
        linking_method=linking_method,
        clustering_method=clustering_method,
        min_samples=min_samples,
        n_jobs=self.n_jobs,
    )

    return data_events
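
Continuing the sketch, collective events can then be detected on the binarized data (parameter values are illustrative):

events = ts.track_collective_events(eps=1.0, min_clustersize=2)
# 'events' holds the detected points with an added 'clTrackID' column (the default clid_column)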

track_events_dataframe(X, position_columns, frame_column, id_column=None, binarized_measurement_column=None, clid_column='collid', eps=1.0, eps_prev=None, min_clustersize=3, min_samples=None, clustering_method='dbscan', linking_method='nearest', allow_merges=False, allow_splits=False, stability_threshold=10, remove_small_clusters=False, min_size_for_split=1, reg=1, reg_m=10, cost_threshold=0, n_prev=1, predictor=False, n_jobs=1, show_progress=True, **kwargs)

Function to track collective events in a dataframe.

Parameters:

Name Type Description Default
X DataFrame

The input dataframe containing the data to track.

required
position_columns List[str]

The names of the columns representing coordinates.

required
frame_column str

The name of the column containing frame ids.

required
id_column str | None

The name of the column representing IDs. None if no such column.

None
binarized_measurement_column str | None

The name of the column representing binarized measurements; if None, all measurements are used.

None
clid_column str

The name of the output column representing collective events, will be generated.

'collid'
eps float

Maximum distance for clustering, default is 1.

1.0
eps_prev float | None

Maximum distance for linking previous clusters, if None, eps is used. Default is None.

None
min_clustersize int

Minimum cluster size. Default is 3.

3
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.

None
clustering_method str

The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".

'dbscan'
linking_method str

The method used for linking, one of ['nearest', 'transportsolver']. Default is 'nearest'.

'nearest'
allow_merges bool

Whether or not to allow merges. Default is False.

False
allow_splits bool

Whether or not to allow splits. Default is False.

False
stability_threshold int

Number of frames to consider for stability. Default is 10.

10
remove_small_clusters bool

Whether or not to remove small clusters. Default is False.

False
min_size_for_split int

Minimum size for a split. Default is 1.

1
reg float

Regularization parameter for transportation solver. Default is 1.

1
reg_m float

Regularization parameter for transportation solver. Default is 10.

10
cost_threshold float

Cost threshold for transportation solver. Default is 0.

0
n_prev int

Number of previous frames to consider. Default is 1.

1
predictor bool | Callable

Whether or not to use a predictor. Default is False. True uses the default predictor. A callable can be passed to use a custom predictor. See default predictor method for details.

False
n_jobs int

Number of jobs to run in parallel. Default is 1.

1
show_progress bool

Whether or not to show progress bar. Default is True.

True
**kwargs Any

Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
- epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead.
- minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead.
- minSamples: Deprecated parameter for min_samples. Use min_samples instead.
- clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead.
- linkingMethod: Deprecated parameter for linking_method. Use linking_method instead.
- nPrev: Deprecated parameter for n_prev. Use n_prev instead.
- nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.
- showProgress: Deprecated parameter for show_progress. Use show_progress instead.

{}

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame with tracked events. If allow_merges or allow_splits is True, a tuple of (DataFrame, LineageTracker) is returned instead.

Source code in arcos4py/tools/_detect_events.py
def track_events_dataframe(
    X: pd.DataFrame,
    position_columns: List[str],
    frame_column: str,
    id_column: str | None = None,
    binarized_measurement_column: str | None = None,
    clid_column: str = "collid",
    eps: float = 1.0,
    eps_prev: float | None = None,
    min_clustersize: int = 3,
    min_samples: int | None = None,
    clustering_method: str = "dbscan",
    linking_method: str = 'nearest',
    allow_merges: bool = False,
    allow_splits: bool = False,
    stability_threshold: int = 10,
    remove_small_clusters: bool = False,
    min_size_for_split: int = 1,
    reg: float = 1,
    reg_m: float = 10,
    cost_threshold: float = 0,
    n_prev: int = 1,
    predictor: bool | Callable = False,
    n_jobs: int = 1,
    show_progress: bool = True,
    **kwargs,
) -> pd.DataFrame | tuple[pd.DataFrame, LineageTracker]:
    """Function to track collective events in a dataframe.

    Arguments:
        X (pd.DataFrame): The input dataframe containing the data to track.
        position_columns (List[str]): The names of the columns representing coordinates.
        frame_column (str): The name of the column containing frame ids.
        id_column (str | None): The name of the column representing IDs. None if no such column.
        binarized_measurement_column (str | None): The name of the column representing binarized measurements;
            if None, all measurements are used.
        clid_column (str): The name of the output column representing collective events, will be generated.
        eps (float): Maximum distance for clustering, default is 1.
        eps_prev (float | None): Maximum distance for linking previous clusters, if None, eps is used. Default is None.
        min_clustersize (int): Minimum cluster size. Default is 3.
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
        clustering_method (str): The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".
        linking_method (str): The method used for linking, one of ['nearest', 'transportsolver']. Default is 'nearest'.
        allow_merges (bool): Whether or not to allow merges. Default is False.
        allow_splits (bool): Whether or not to allow splits. Default is False.
        stability_threshold (int): Number of frames to consider for stability. Default is 10.
        remove_small_clusters (bool): Whether or not to remove small clusters. Default is False.
        min_size_for_split (int): Minimum size for a split. Default is 1.
        reg (float): Regularization parameter for transportation solver. Default is 1.
        reg_m (float): Regularization parameter for transportation solver. Default is 10.
        cost_threshold (float): Cost threshold for transportation solver. Default is 0.
        n_prev (int): Number of previous frames to consider. Default is 1.
        predictor (bool | Callable): Whether or not to use a predictor. Default is False.
            True uses the default predictor. A callable can be passed to use a custom predictor.
            See default predictor method for details.
        n_jobs (int): Number of jobs to run in parallel. Default is 1.
        show_progress (bool): Whether or not to show progress bar. Default is True.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
            - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead.
            - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead.
            - minSamples: Deprecated parameter for min_samples. Use min_samples instead.
            - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead.
            - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead.
            - nPrev: Deprecated parameter for n_prev. Use n_prev instead.
            - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.
            - showProgress: Deprecated parameter for show_progress. Use show_progress instead.

    Returns:
        pd.DataFrame: DataFrame with tracked events. If allow_merges or allow_splits is True,
            a tuple of (DataFrame, LineageTracker) is returned instead.
    """
    map_params = {
        "coordinates_column": "position_columns",
        "bin_meas_column": "binarized_measurement_column",
        "collid_column": "clid_column",
        'epsPrev': 'eps_prev',
        'minClSz': 'min_clustersize',
        'minSamples': 'min_samples',
        'clusteringMethod': 'clustering_method',
        'linkingMethod': 'linking_method',
        'nPrev': 'n_prev',
        'nJobs': 'n_jobs',
        'showProgress': 'show_progress',
    }

    # check for allowed kwargs
    for key in kwargs:
        if key not in map_params.keys():
            raise ValueError(f'Invalid keyword argument {key}')

    # Handle deprecated parameters
    kwargs = handle_deprecated_params(map_params, **kwargs)

    # Assign parameters
    eps_prev = kwargs.get('eps_prev', eps_prev)
    min_clustersize = kwargs.get('min_clustersize', min_clustersize)
    min_samples = kwargs.get('min_samples', min_samples)
    clustering_method = kwargs.get('clustering_method', clustering_method)
    linking_method = kwargs.get('linking_method', linking_method)
    n_prev = kwargs.get('n_prev', n_prev)
    n_jobs = kwargs.get('n_jobs', n_jobs)

    linker = Linker(
        eps=eps,
        eps_prev=eps_prev,
        min_clustersize=min_clustersize,
        min_samples=min_samples,
        clustering_method=clustering_method,
        linking_method=linking_method,
        n_prev=n_prev,
        predictor=predictor,
        n_jobs=n_jobs,
        allow_merges=allow_merges,
        allow_splits=allow_splits,
        stability_threshold=stability_threshold,
        remove_small_clusters=remove_small_clusters,
        min_size_for_split=min_size_for_split,
        reg=reg,
        reg_m=reg_m,
        cost_threshold=cost_threshold,
    )

    tracker = DataFrameTracker(
        linker=linker,
        position_columns=position_columns,
        frame_column=frame_column,
        obj_id_column=id_column,
        binarized_measurement_column=binarized_measurement_column,
        clid_column=clid_column,
    )
    df_out = pd.concat(
        [timepoint for timepoint in tqdm(tracker.track(X), total=X[frame_column].nunique(), disable=not show_progress)]
    ).reset_index(drop=True)

    if any([allow_merges, allow_splits]):
        return df_out.query(f"{clid_column} != -1").reset_index(drop=True), linker.lineage_tracker
    return df_out.query(f"{clid_column} != -1").reset_index(drop=True)
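
A hedged standalone call, assuming the function is exported from arcos4py.tools (matching the source path shown above) and that df_bin already carries a binarized measurement column; column names are illustrative:

from arcos4py.tools import track_events_dataframe

events = track_events_dataframe(
    X=df_bin,
    position_columns=["x", "y"],
    frame_column="time",
    id_column="id",
    binarized_measurement_column="meas.bin",
    eps=2.0,
    min_clustersize=3,
)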

track_events_image(X, eps=1, eps_prev=None, min_clustersize=1, min_samples=None, clustering_method='dbscan', n_prev=1, predictor=False, linking_method='nearest', allow_merges=False, allow_splits=False, stability_threshold=10, remove_small_clusters=False, min_size_for_split=1, reg=1, reg_m=10, cost_threshold=0, dims='TXY', downsample=1, n_jobs=1, show_progress=True, **kwargs)

Function to track events in an image using specified linking and clustering methods.

Parameters:

Name Type Description Default
X ndarray

The input array containing the images to track.

required
eps float

Distance for clustering. Default is 1.

1
eps_prev float | None

Maximum distance for linking previous clusters, if None, eps is used. Default is None.

None
min_clustersize int

Minimum cluster size. Default is 1.

1
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.

None
clustering_method str

The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".

'dbscan'
n_prev int

Number of previous frames to consider. Default is 1.

1
predictor bool | Callable

Whether or not to use a predictor. Default is False. True uses the default predictor. A callable can be passed to use a custom predictor. See default predictor method for details.

False
linking_method str

The method used for linking. Default is 'nearest'.

'nearest'
allow_merges bool

Whether or not to allow merges. Default is False.

False
allow_splits bool

Whether or not to allow splits. Default is False.

False
stability_threshold int

The number of frames required for a stable merge or split. Default is 10.

10
remove_small_clusters bool

Whether or not to remove small clusters. Default is False.

False
min_size_for_split int

Minimum size for a split. Default is 1.

1
reg float

Entropy regularization parameter for unbalanced OT algorithm (only for transportation linking).

1
reg_m float

Marginal relaxation parameter for unbalanced OT (only for transportation linking).

10
cost_threshold float

Threshold for filtering low-probability matches (only for transportation linking).

0
dims str

String of dimensions in order. Default is "TXY". Possible values are "T", "X", "Y", "Z".

'TXY'
downsample int

Factor by which to downsample the image. Default is 1.

1
n_jobs int

Number of jobs to run in parallel. Default is 1.

1
show_progress bool

Whether or not to show progress bar. Default is True.

True
**kwargs Any

Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
- epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead.
- minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead.
- minSamples: Deprecated parameter for min_samples. Use min_samples instead.
- clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead.
- linkingMethod: Deprecated parameter for linking_method. Use linking_method instead.
- nPrev: Deprecated parameter for n_prev. Use n_prev instead.
- nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.
- showProgress: Deprecated parameter for show_progress. Use show_progress instead.

{}

Returns:

Type Description
ndarray | tuple[ndarray, LineageTracker]

np.ndarray: Array of images with tracked events. If allow_merges or allow_splits is True, a tuple of (array, LineageTracker) is returned instead.

Source code in arcos4py/tools/_detect_events.py
def track_events_image(
    X: np.ndarray,
    eps: float = 1,
    eps_prev: float | None = None,
    min_clustersize: int = 1,
    min_samples: int | None = None,
    clustering_method: str = "dbscan",
    n_prev: int = 1,
    predictor: bool | Callable = False,
    linking_method: str = 'nearest',
    allow_merges: bool = False,
    allow_splits: bool = False,
    stability_threshold: int = 10,
    remove_small_clusters: bool = False,
    min_size_for_split: int = 1,
    reg: float = 1,
    reg_m: float = 10,
    cost_threshold: float = 0,
    dims: str = "TXY",
    downsample: int = 1,
    n_jobs: int = 1,
    show_progress: bool = True,
    **kwargs,
) -> np.ndarray | tuple[np.ndarray, LineageTracker]:
    """Function to track events in an image using specified linking and clustering methods.

    Arguments:
        X (np.ndarray): The input array containing the images to track.
        eps (float): Distance for clustering. Default is 1.
        eps_prev (float | None): Maximum distance for linking previous clusters, if None, eps is used. Default is None.
        min_clustersize (int): Minimum cluster size. Default is 1.
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
        clustering_method (str): The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".
        n_prev (int): Number of previous frames to consider. Default is 1.
        predictor (bool | Callable): Whether or not to use a predictor. Default is False.
            True uses the default predictor. A callable can be passed to use a custom predictor.
            See default predictor method for details.
        linking_method (str): The method used for linking. Default is 'nearest'.
        allow_merges (bool): Whether or not to allow merges. Default is False.
        allow_splits (bool): Whether or not to allow splits. Default is False.
        stability_threshold (int): The number of frames required for a stable merge or split. Default is 10.
        remove_small_clusters (bool): Whether or not to remove small clusters. Default is False.
        min_size_for_split (int): Minimum size for a split. Default is 1.
        reg (float): Entropy regularization parameter for unbalanced OT algorithm (only for transportation linking).
        reg_m (float): Marginal relaxation parameter for unbalanced OT (only for transportation linking).
        cost_threshold (float): Threshold for filtering low-probability matches (only for transportation linking).
        dims (str): String of dimensions in order. Default is "TXY". Possible values are "T", "X", "Y", "Z".
        downsample (int): Factor by which to downsample the image. Default is 1.
        n_jobs (int): Number of jobs to run in parallel. Default is 1.
        show_progress (bool): Whether or not to show progress bar. Default is True.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
            - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead.
            - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead.
            - minSamples: Deprecated parameter for min_samples. Use min_samples instead.
            - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead.
            - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead.
            - nPrev: Deprecated parameter for n_prev. Use n_prev instead.
            - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.
            - showProgress: Deprecated parameter for show_progress. Use show_progress instead.

    Returns:
        np.ndarray: Array of images with tracked events. If allow_merges or allow_splits is True,
            a tuple of (array, LineageTracker) is returned instead.
    """
    map_params = {
        'epsPrev': 'eps_prev',
        'minClSz': 'min_clustersize',
        'minSamples': 'min_samples',
        'clusteringMethod': 'clustering_method',
        'linkingMethod': 'linking_method',
        'nPrev': 'n_prev',
        'nJobs': 'n_jobs',
        'showProgress': 'show_progress',
    }

    # check for allowed kwargs
    for key in kwargs:
        if key not in map_params.keys():
            raise ValueError(f'Invalid keyword argument {key}')

    # Handle deprecated parameters
    kwargs = handle_deprecated_params(map_params, **kwargs)

    # Assign parameters
    eps_prev = kwargs.get('eps_prev', eps_prev)
    min_clustersize = kwargs.get('min_clustersize', min_clustersize)
    min_samples = kwargs.get('min_samples', min_samples)
    clustering_method = kwargs.get('clustering_method', clustering_method)
    linking_method = kwargs.get('linking_method', linking_method)
    n_prev = kwargs.get('n_prev', n_prev)
    n_jobs = kwargs.get('n_jobs', n_jobs)

    # Determine the dimensionality
    spatial_dims = set("XYZ")
    D = len([d for d in dims if d in spatial_dims])

    # Adjust parameters based on dimensionality
    adjusted_epsPrev = eps_prev / downsample if eps_prev is not None else None
    adjusted_minClSz = int(min_clustersize / (downsample**D))
    adjusted_minSamples = int(min_samples / (downsample**D)) if min_samples is not None else None

    linker = Linker(
        eps=eps / downsample,
        eps_prev=adjusted_epsPrev,
        min_clustersize=adjusted_minClSz,
        min_samples=adjusted_minSamples,
        clustering_method=clustering_method,
        linking_method=linking_method,
        n_prev=n_prev,
        predictor=predictor,
        reg=reg,
        reg_m=reg_m,
        cost_threshold=cost_threshold,
        n_jobs=n_jobs,
        allow_merges=allow_merges,
        allow_splits=allow_splits,
        stability_threshold=stability_threshold,
        remove_small_clusters=remove_small_clusters,
        min_size_for_split=min_size_for_split,
    )
    tracker = ImageTracker(linker, downsample=downsample)
    # find indices of T in dims
    T_index = dims.upper().index("T")
    out = np.zeros_like(X, dtype=np.uint16)

    for i in tqdm(range(X.shape[T_index]), disable=not show_progress):
        out[i] = tracker.track_iteration(X[i])

    if any([allow_merges, allow_splits]):
        return out, linker.lineage_tracker

    return out
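
A hedged example on a synthetic binary movie, assuming the function is exported from arcos4py.tools:

import numpy as np
from arcos4py.tools import track_events_image

movie = np.zeros((10, 64, 64), dtype=np.uint8)
movie[:, 20:30, 20:30] = 1  # one persistent active region
tracked = track_events_image(movie, eps=1.5, min_clustersize=4, dims="TXY")
# 'tracked' labels the region with a single event id across all 10 frames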

plotting

Tools for plotting collective events.

LineagePlot(figsize=(18, 18), node_size=50, edge_width=2, edge_alpha=0.8, color_seed=42, title='Cluster Lineage Tree', xlabel='Frame', ylabel='Lineage', font_size=16, curve_factor=0.9, orphan_color=(0.7, 0.7, 0.7, 1.0), color_by='lineage_id', show_node_labels=False, main_lineage_id=None)

Class to draw a lineage tree of clusters over time.

Attributes:

Name Type Description
figsize tuple

Size of the figure.

node_size int

Size of the nodes.

edge_width int

Width of the edges.

edge_alpha float

Alpha value of the edges.

color_seed int

Seed for the color generation.

title str

Title of the plot.

xlabel str

Label of the x-axis.

ylabel str

Label of the y-axis.

font_size int

Font size of the labels.

curve_factor float

Factor to curve the edges.

orphan_color tuple

Color of the orphan nodes.

color_by str

Attribute to color the plot by ('lineage_id' or 'cluster_id').

show_node_labels bool

If True, display node labels on the plot.

main_lineage_id int

The lineage ID of the main lineage to be plotted on the same row.

Source code in arcos4py/plotting/_plotting.py
def __init__(
    self,
    figsize=(18, 18),
    node_size=50,
    edge_width=2,
    edge_alpha=0.8,
    color_seed=42,
    title="Cluster Lineage Tree",
    xlabel="Frame",
    ylabel="Lineage",
    font_size=16,
    curve_factor=0.9,
    orphan_color=(0.7, 0.7, 0.7, 1.0),
    color_by='lineage_id',  # 'lineage_id' or 'cluster_id'
    show_node_labels=False,  # Whether to display node labels
    main_lineage_id=None,  # The main lineage to keep on the same row
):
    """Constructs class with given parameters."""
    self.fig, self.ax = plt.subplots(figsize=figsize)
    self.node_size = node_size
    self.edge_width = edge_width
    self.edge_alpha = edge_alpha
    self.title = title
    self.xlabel = xlabel
    self.ylabel = ylabel
    self.font_size = font_size
    self.curve_factor = curve_factor
    self.orphan_color = orphan_color
    self.color_seed = color_seed
    self.color_by = color_by
    self.show_node_labels = show_node_labels
    self.main_lineage_id = main_lineage_id  # Store the main lineage ID

    self.colors: Dict[int, Tuple[float, float, float, float]] = {}
    self.node_positions: Dict[Tuple[int, int], Tuple[float, float]] = {}
    self.lineage_edges: List[Tuple[Tuple[int, int], Tuple[int, int]]] = []
    self.node_color: Dict[Tuple[int, int], Tuple[float, float, float, float]] = {}
    self.child_to_parent: Dict[Tuple[int, int], Set[Tuple[int, int]]] = {}
    self.parent_to_child: Dict[Tuple[int, int], Set[Tuple[int, int]]] = {}
    self.lineage_order: List[int] = []
    self.all_nodes: Set[Tuple[int, int]] = set()
    self.frame_to_nodes: Dict[int, List[Tuple[int, int]]] = defaultdict(list)
    self.node_to_lineage_id: Dict[Tuple[int, int], int] = {}
    self.node_to_cluster_id: Dict[Tuple[int, int], int] = {}
    self.node_to_plot_lineage_id: Dict[Tuple[int, int], int] = {}
    self.plot_lineage_id_to_lineage_id: Dict[int, int] = {}  # New mapping

draw_tree(tracker)

Draw the lineage tree based on the processed data.

Source code in arcos4py/plotting/_plotting.py
def draw_tree(self, tracker):
    """Draw the lineage tree based on the processed data."""
    self._process_data(tracker)

    # Draw edges with assigned coloring
    for source, target in self.lineage_edges:
        source_pos = self.node_positions[source]
        target_pos = self.node_positions[target]
        color = self.node_color.get(source, self.orphan_color)
        self._draw_curved_edge(source_pos, target_pos, color)

    # Draw nodes
    for node in self.minframe_nodes:
        pos = self.node_positions[node]
        color = self.node_color.get(node, self.orphan_color)
        self.ax.scatter(pos[0], pos[1], s=self.node_size, c=[color], zorder=2)
        if self.show_node_labels:
            label = f"{node[1]}"  # Use cluster_id as label
            self.ax.text(pos[0], pos[1], label, fontsize=self.font_size - 4, ha='right', va='bottom')

    # Set labels and title
    self.ax.set_title(self.title, fontsize=self.font_size)
    self.ax.set_xlabel(self.xlabel, fontsize=self.font_size - 2)
    self.ax.set_ylabel(self.ylabel, fontsize=self.font_size - 2)

    # Set y-ticks to plotting lineage IDs with true lineage IDs as labels
    lineage_ids = self.lineage_order
    plot_lineage_id_to_y = {lineage_id: idx for idx, lineage_id in enumerate(lineage_ids)}
    max_idx = len(lineage_ids) - 1 if len(lineage_ids) > 0 else 1
    y_ticks = [plot_lineage_id_to_y[lineage_id] / max_idx if max_idx > 0 else 0.5 for lineage_id in lineage_ids]
    # Get corresponding true lineage IDs for labels
    y_tick_labels = [self.plot_lineage_id_to_lineage_id[lineage_id] for lineage_id in lineage_ids]
    self.ax.set_yticks(y_ticks)
    self.ax.set_yticklabels(y_tick_labels, fontsize=self.font_size - 4)
    plt.tight_layout()

show()

Display the plot.

Source code in arcos4py/plotting/_plotting.py
def show(self):
    """Display the plot."""
    plt.show()
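
A hedged usage sketch: when tracking with allow_merges or allow_splits enabled, the returned LineageTracker can be visualised (assuming LineagePlot is exported from arcos4py.plotting, and reusing the movie from the track_events_image example):

from arcos4py.plotting import LineagePlot
from arcos4py.tools import track_events_image

tracked, lineage = track_events_image(movie, eps=1.5, min_clustersize=4, dims="TXY",
                                      allow_merges=True, allow_splits=True)
tree = LineagePlot(color_by="lineage_id")
tree.draw_tree(lineage)
tree.show()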

NoodlePlot(df, clid_column='collid', obj_id_column='obj_id', frame_column='frame', posx='x', posy='y', posz=None, **kwargs)

Create Noodle Plot of cell tracks, colored by collective event id.

Attributes:

Name Type Description
df DataFrame

DataFrame containing collective events from arcos.

clid_column str

Name of the collective event column in df.

obj_id_column str

Name of the track column in df.

frame_column str

Name of the frame column in df.

posx str

Name of the X coordinate column in df.

posy str

Name of the Y coordinate column in df.

posz str

Name of the Z coordinate column in df, or None if no z column.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing collective events from arcos.

required
clid_column str

Name of the collective event column in df.

'collid'
obj_id_column str

Name of the track column in df.

'obj_id'
frame_column str

Name of the frame column in df.

'frame'
posx str

Name of the X coordinate column in df.

'x'
posy str

Name of the Y coordinate column in df.

'y'
posz str | None

Name of the Z coordinate column in df, or None if no z column.

None
**kwargs Any

Additional keyword arguments for plot. Includes deprecated parameters.
- colev (str): Deprecated. Use clid_column instead.
- trackid (str): Deprecated. Use obj_id_column instead.
- frame (str): Deprecated. Use frame_column instead.

{}
Source code in arcos4py/plotting/_plotting.py
def __init__(
    self,
    df: pd.DataFrame,
    clid_column: str = "collid",
    obj_id_column: str = "obj_id",
    frame_column: str = "frame",
    posx: str = "x",
    posy: str = "y",
    posz: Union[str, None] = None,
    **kwargs,
):
    """Constructs class with given parameters.

    Arguments:
        df (pd.DataFrame): DataFrame containing collective events from arcos.
        clid_column (str): Name of the collective event column in df.
        obj_id_column (str): Name of the track column in df.
        frame_column (str): Name of the frame column in df.
        posx (str): Name of the X coordinate column in df.
        posy (str): Name of the Y coordinate column in df.
        posz (str | None): Name of the Z coordinate column in df,
            or None if no z column.
        **kwargs (Any): Additional keyword arguments for plot. Includes deprecated parameters.
            - colev (str): Deprecated. Use clid_column instead.
            - trackid (str): Deprecated. Use obj_id_column instead.
            - frame (str): Deprecated. Use frame_column instead.
    """
    map_deprecated_params = {
        "colev": "clid_column",
        "trackid": "obj_id_column",
        "frame": "frame_column",
    }

    # allowed matplotlib kwargs
    allowed_kwargs = [
        "alpha",
        "animated",
        "c",
        "label",
        "linewidth",
        "linestyle",
        "marker",
        "markersize",
        "markeredgecolor",
        "markerfacecolor",
        "markerfacecoloralt",
        "markeredgewidth",
        "path_effects",
        "picker",
        "pickradius",
        "solid_capstyle",
        "solid_joinstyle",
        "transform",
        "visible",
        "zorder",
    ]

    # check allowed kwargs
    allowed_kwargs_2 = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs and key not in allowed_kwargs_2:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    clid_column = updated_kwargs.pop("clid_column", clid_column)
    obj_id_column = updated_kwargs.pop("obj_id_column", obj_id_column)
    frame_column = updated_kwargs.pop("frame_column", frame_column)

    self.df = df
    self.clid_column = clid_column
    self.obj_id_column = obj_id_column
    self.frame_column = frame_column
    self.posx = posx
    self.posy = posy
    self.posz = posz
    self.plot_kwargs = updated_kwargs

plot(projection_axis, color_cylce=TAB20)

Create Noodle Plot of cell tracks, colored by collective event id.

Parameters:

Name Type Description Default
projection_axis str

Specify which coordinate the noodle plot should be drawn with. Has to be one of the posx, posy or posz arguments passed in during the class instantiation process.

required
color_cylce list[str]

List of hex color values or string names (i.e. ['red', 'yellow']) used to color collective events. Cycles through list.

TAB20

Returns:

Name Type Description
fig Figure

Matplotlib figure object for the noodle plot.

axes Axes

Matplotlib axes for the noodle plot.

Source code in arcos4py/plotting/_plotting.py
def plot(self, projection_axis: str, color_cylce: list[str] = TAB20):
    """Create Noodle Plot of cell tracks, colored by collective event id.

    Arguments:
        projection_axis (str): Specify which coordinate the noodle
            plot should be drawn with. Has to be one of the posx, posy or posz arguments
            passed in during the class instantiation process.
        color_cylce (list[str]): List of hex color values or string names
            (i.e. ['red', 'yellow']) used to color collective events. Cycles through list.

    Returns:
        fig (matplotlib.figure.Figure): Matplotlib figure object for the noodle plot.
        axes (matplotlib.axes.Axes): Matplotlib axes for the noodle plot.
    """
    if projection_axis not in [self.posx, self.posy, self.posz]:
        raise ValueError(f"projection_axis has to be one of {[self.posx, self.posy, self.posz]}")
    if projection_axis == self.posx:
        self.projection_index = 3
    elif projection_axis == self.posy:
        self.projection_index = 4
    elif projection_axis == self.posz:
        self.projection_index = 5
    if self.df.empty:
        grpd_data: list[np.ndarray] = []
        colors: np.ndarray = np.array([])
    else:
        grpd_data, colors = self._prepare_data_noodleplot(
            self.df,
            color_cylce,
            self.clid_column,
            self.obj_id_column,
            self.frame_column,
            self.posx,
            self.posy,
            self.posz,
        )
    fig, axes = self._create_noodle_plot(grpd_data, colors)
    return fig, axes
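
A minimal usage sketch, assuming NoodlePlot is exported from arcos4py.plotting and that events.csv is a hypothetical file holding tracked collective events with the column names used below:

import matplotlib.pyplot as plt
import pandas as pd

from arcos4py.plotting import NoodlePlot

# Hypothetical input: tracked collective events in "long format".
df_events = pd.read_csv("events.csv")

ndp = NoodlePlot(
    df_events,
    clid_column="collid",
    obj_id_column="obj_id",
    frame_column="frame",
    posx="x",
    posy="y",
)
# Project tracks onto the x coordinate; tracks are colored by event id.
fig, axes = ndp.plot(projection_axis="x")
plt.show()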

dataPlots(data, frame_column='frame', measurement_column='m', obj_id_column='obj_id', **kwargs)

Plot different metrics of input data.

Attributes:

Name Type Description
data Dataframe

containing ARCOS data.

frame_column str

name of frame column in data.

measurement_column str

name of measurement column in data.

obj_id_column str

name of track id column.

Parameters:

Name Type Description Default
data Dataframe

containing ARCOS data.

required
frame_column str

name of frame column in data.

'frame'
measurement_column str

name of measurement column in data.

'm'
obj_id_column str

name of track id column.

'obj_id'
**kwargs Any

Additional keyword arguments. Includes deprecated parameters. - id (str): Deprecated. Use obj_id_column instead. - frame (str): Deprecated. Use frame_column instead. - measurement (str): Deprecated. Use measurement_column instead.

{}
Source code in arcos4py/plotting/_plotting.py
def __init__(
    self,
    data: pd.DataFrame,
    frame_column: str = 'frame',
    measurement_column: str = 'm',
    obj_id_column: str = 'obj_id',
    **kwargs,
):
    """Plot different metrics such as histogram, position-t and density.

    Arguments:
        data (Dataframe): containing ARCOS data.
        frame_column (str): name of frame column in data.
        measurement_column (str): name of measurement column in data.
        obj_id_column (str): name of track id column.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - id (str): Deprecated. Use obj_id_column instead.
            - frame (str): Deprecated. Use frame_column instead.
            - measurement (str): Deprecated. Use measurement_column instead.
    """
    map_deprecated_params = {
        "id": "obj_id_column",
        "frame": "frame_column",
        "measurement": "measurement_column",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)
    frame_column = updated_kwargs.get("frame_column", frame_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)

    self.data = data
    self.obj_id = obj_id_column
    self.frame_column = frame_column
    self.measurement_column = measurement_column
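
A short usage sketch of the plotting methods documented below, assuming dataPlots is exported from arcos4py.plotting; tracks.csv and its 'frame', 'm', 'obj_id' and 'x' columns are hypothetical stand-ins for real tracked data:

import matplotlib.pyplot as plt
import pandas as pd

from arcos4py.plotting import dataPlots

df = pd.read_csv("tracks.csv")  # hypothetical input file

dp = dataPlots(df, frame_column="frame", measurement_column="m", obj_id_column="obj_id")
dp.histogram(bins="auto")  # track length histogram
dp.density_plot()  # measurement density
fig, axes = dp.position_t_plot(position_columns={"x"}, n=20)  # x over time for 20 sampled tracks
plt.show()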

density_plot(*args, **kwargs)

Density plot of measurement.

Uses seaborn displot to plot measurement density.

Parameters:

Name Type Description Default
*args Any

arguments passed on to seaborn displot function.

()
**kwargs Any

keyword arguments passed on to seaborn displot function.

{}

Returns:

Name Type Description
FacetGrid FacetGrid

Seaborn FacetGrid of the density plot.

Source code in arcos4py/plotting/_plotting.py
def density_plot(self, *args, **kwargs):
    """Density plot of measurement.

    Uses seaborn displot to plot measurement density.

    Arguments:
        *args (Any): arguments passed on to seaborn displot function.
        **kwargs (Any): keyword arguments passed on to seaborn displot function.

    Returns:
        FacetGrid (seaborn.FacetGrid): Seaborn FacetGrid of the density plot.
    """
    plot = sns.displot(
        self.data[self.measurement_column],
        kind="kde",
        palette="pastel",
        label=self.measurement_column,
        *args,
        **kwargs,
    )
    # Plot formatting
    plt.legend(prop={'size': 10})
    plt.title('Density Plot of Measurement')
    plt.xlabel('Measurement')
    plt.ylabel('Density')
    return plot

histogram(bins='auto', *args, **kwargs)

Histogram of track length.

Uses seaborn histplot function to plot a track length histogram.

Parameters:

Name Type Description Default
bins str

number or width of bins in histogram

'auto'
*args Any

arguments passed on to seaborn histplot function.

()
**kwargs Any

keyword arguments passed on to seaborn histplot function.

{}

Returns:

Name Type Description
AxesSubplot Axes

Matplotlib AxesSubplot of histogram.

Source code in arcos4py/plotting/_plotting.py
def histogram(self, bins: str = 'auto', *args, **kwargs) -> plt.Axes:
    """Histogram of tracklenght.

    Uses seaborn histplot function to plot a track length histogram.

    Arguments:
        bins (str): number or width of bins in histogram
        *args (Any): arguments passed on to seaborn histplot function.
        **kwargs (Any): keyword arguments passed on to seaborn histplot function.

    Returns:
        AxesSubplot: Matplotlib AxesSubplot of histogram.
    """
    # Draw histogram
    track_length = self.data.groupby(self.obj_id).size()
    axes = sns.histplot(track_length, label="Track Length", bins=bins, *args, **kwargs)
    # Plot formatting
    plt.title('Track length Histogram')
    axes.set_xlabel('Track Length')
    axes.set_ylabel('Count')
    return axes

position_t_plot(position_columns={'x'}, n=20, **kwargs)

Plots X and Y over T to visualize track length.

Parameters:

Name Type Description Default
position_columns set

containing names of position columns in data.

{'x'}
n int

number of samples to plot.

20
**kwargs Any

Additional keyword arguments. Includes deprecated parameters. - posCol (set): Deprecated. Use position_columns instead.

{}

Returns:

Name Type Description
fig Figure

Matplotlib figure object of density plot.

axes Axes

Matplotlib axes of density plot.

Source code in arcos4py/plotting/_plotting.py
def position_t_plot(self, position_columns: set[str] = {'x'}, n: int = 20, **kwargs) -> Union[plt.Figure, Any]:
    """Plots X and Y over T to visualize tracklength.

    Arguments:
        position_columns (set): containing names of position columns in data.
        n (int): number of samples to plot.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - posCol (set): Deprecated. Use position_columns instead.

    Returns:
        fig (matplotlib.figure.Figure): Matplotlib figure object of density plot.
        axes (matplotlib.axes.Axes): Matplotlib axes of density plot.
    """
    map_deprecated_params = {
        "posCol": "position_columns",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    position_columns = updated_kwargs.get("position_columns", position_columns)

    sample = pd.Series(self.data[self.obj_id].unique()).sample(n)
    pd_from_r_df = self.data.loc[self.data[self.obj_id].isin(sample)]
    fig, axes = plt.subplots(1, len(position_columns), figsize=(6, 3))
    for _, df in pd_from_r_df.groupby(self.obj_id):
        for index, value in enumerate(position_columns):
            if len(position_columns) > 1:
                df.plot(x=self.frame_column, y=value, ax=axes[index], legend=None)
            else:
                df.plot(x=self.frame_column, y=value, ax=axes, legend=None)
    if len(position_columns) > 1:
        for index, value in enumerate(position_columns):
            axes[index].set_title(value)
    else:
        axes.set_title(value)
    return fig, axes

plotOriginalDetrended(data, frame_column='frame', measurement_column='m', detrended_column='m_detrended', obj_id_column='obj_id', seed=42, **kwargs)

Plot original and detrended data.

Attributes:

Name Type Description
data DataFrame

containing ARCOS data.

frame_column str

name of frame column in data.

measurement_column str

name of measurement column in data.

detrended_column str

name of detrended column in data.

obj_id_column str

name of track id column.

seed int

seed for random number generator.

Methods:

Name Description
plot_detrended

plot detrended data.

plot_original

plot original data.

plot_original_and_detrended

plot original and detrended data.

Source code in arcos4py/plotting/_plotting.py
def __init__(
    self,
    data: pd.DataFrame,
    frame_column: str = "frame",
    measurement_column: str = "m",
    detrended_column: str = "m_detrended",
    obj_id_column: str = "obj_id",
    seed: int = 42,
    **kwargs,
):
    """Constructs class with given parameters."""
    map_deprecated_params = {
        "id": "obj_id_column",
        "frame": "frame_column",
        "detrended": "detrended_column",
        "measurement": "measurement_column",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)
    frame_column = updated_kwargs.get("frame_column", frame_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)

    self.data = data
    self.frame_column = frame_column
    self.measurement_column = measurement_column
    self.detrended_column = detrended_column
    self.obj_id_column = obj_id_column
    self.seed = seed

plot_detrended(n_samples=25, subplots=(5, 5), plotsize=(20, 10), add_binary_segments=False)

Plots detrended data.

Parameters:

Name Type Description Default
n_samples int

number of samples to plot.

25
subplots tuple

number of subplots in x and y direction.

(5, 5)
plotsize tuple

size of the plot.

(20, 10)
add_binary_segments bool

if True, binary segments are added to the plot.

False

Returns:

Name Type Description
fig Figure

Matplotlib figure object of plot.

axes Axes

Matplotlib axes of plot.

Source code in arcos4py/plotting/_plotting.py
def plot_detrended(
    self,
    n_samples: int = 25,
    subplots: tuple = (5, 5),
    plotsize: tuple = (20, 10),
    add_binary_segments: bool = False,
) -> tuple[plt.Figure, Any]:
    """Plots detrended data.

    Arguments:
        n_samples (int): number of samples to plot.
        subplots (tuple): number of subplots in x and y direction.
        plotsize (tuple): size of the plot.
        add_binary_segments (bool): if True, binary segments are added to the plot.

    Returns:
        fig (matplotlib.figure.Figure): Matplotlib figure object of plot.
        axes (matplotlib.axes.Axes): Matplotlib axes of plot.
    """
    grouped = self._prepare_data(n_samples)
    return self._plot_data(
        grouped, subplots[0], subplots[1], plotsize, [self.detrended_column], ["detrended"], add_binary_segments
    )

plot_original(n_samples=25, subplots=(5, 5), plotsize=(20, 10), add_binary_segments=False)

Plots original data.

Parameters:

Name Type Description Default
n_samples int

number of samples to plot.

25
subplots tuple

number of subplots in x and y direction.

(5, 5)
plotsize tuple

size of the plot.

(20, 10)
add_binary_segments bool

if True, binary segments are added to the plot.

False

Returns:

Name Type Description
fig Figure

Matplotlib figure object of plot.

axes Axes

Matplotlib axes of plot.

Source code in arcos4py/plotting/_plotting.py
def plot_original(
    self,
    n_samples: int = 25,
    subplots: tuple = (5, 5),
    plotsize: tuple = (20, 10),
    add_binary_segments: bool = False,
) -> tuple[plt.Figure, Any]:
    """Plots original data.

    Arguments:
        n_samples (int): number of samples to plot.
        subplots (tuple): number of subplots in x and y direction.
        plotsize (tuple): size of the plot.
        add_binary_segments (bool): if True, binary segments are added to the plot.

    Returns:
        fig (matplotlib.figure.Figure): Matplotlib figure object of plot.
        axes (matplotlib.axes.Axes): Matplotlib axes of plot.
    """
    grouped = self._prepare_data(n_samples)
    return self._plot_data(
        grouped,
        subplots[0],
        subplots[1],
        plotsize,
        [self.measurement_column],
        ["original"],
        add_binary_segments,
    )

plot_original_and_detrended(n_samples=25, subplots=(5, 5), plotsize=(20, 10), add_binary_segments=False)

Plots original and detrended data.

Parameters:

Name Type Description Default
n_samples int

number of samples to plot.

25
subplots tuple

number of subplots in x and y direction.

(5, 5)
plotsize tuple

size of the plot.

(20, 10)
add_binary_segments bool

if True, binary segments are added to the plot.

False

Returns:

Name Type Description
fig Figure

Matplotlib figure object of plot.

axes Axes

Matplotlib axes of plot.

Source code in arcos4py/plotting/_plotting.py
def plot_original_and_detrended(
    self,
    n_samples: int = 25,
    subplots: tuple = (5, 5),
    plotsize: tuple = (20, 10),
    add_binary_segments: bool = False,
) -> tuple[plt.Figure, Any]:
    """Plots original and detrended data.

    Arguments:
        n_samples (int): number of samples to plot.
        subplots (tuple): number of subplots in x and y direction.
        plotsize (tuple): size of the plot.
        add_binary_segments (bool): if True, binary segments are added to the plot.

    Returns:
        fig (matplotlib.figure.Figure): Matplotlib figure object of plot.
        axes (matplotlib.axes.Axes): Matplotlib axes of plot.
    """
    grouped = self._prepare_data(n_samples)
    return self._plot_data(
        grouped,
        subplots[0],
        subplots[1],
        plotsize,
        [self.measurement_column, self.detrended_column],
        ["original", "detrended"],
        add_binary_segments,
    )
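
A minimal sketch tying the three plotting methods together, assuming plotOriginalDetrended is exported from arcos4py.plotting; detrended.csv is a hypothetical file holding both the raw measurement 'm' and its detrended counterpart 'm_detrended':

import matplotlib.pyplot as plt
import pandas as pd

from arcos4py.plotting import plotOriginalDetrended

df = pd.read_csv("detrended.csv")  # hypothetical input file

pod = plotOriginalDetrended(
    df,
    frame_column="frame",
    measurement_column="m",
    detrended_column="m_detrended",
    obj_id_column="obj_id",
)
# 25 randomly sampled tracks on a 5x5 grid, raw and detrended overlaid.
fig, axes = pod.plot_original_and_detrended(n_samples=25, subplots=(5, 5))
plt.show()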

statsPlots(data)

Plot data generated by the stats module.

Attributes:

Name Type Description
data DataFrame

containing ARCOS stats data.

Parameters:

Name Type Description Default
data DataFrame

containing ARCOS stats data.

required
Source code in arcos4py/plotting/_plotting.py
def __init__(self, data: pd.DataFrame):
    """Plot detrended vs original data.

    Arguments:
        data (DataFrame): containing ARCOS stats data.
    """
    self.data = data

plot_events_duration(total_size, duration, point_size=40, *args, **kwargs)

Scatterplot of collective event duration.

Parameters:

Name Type Description Default
total_size str

name of total size column.

required
duration str

name of column with collective event duration.

required
point_size int

scatterplot point size.

40
*args Any

Arguments passed on to seaborn scatterplot function.

()
**kwargs Any

Keyword arguments passed on to seaborn scatterplot function.

{}

Returns:

Name Type Description
Axes Axes

Matplotlib Axes object of scatterplot

Source code in arcos4py/plotting/_plotting.py
def plot_events_duration(self, total_size: str, duration: str, point_size: int = 40, *args, **kwargs) -> plt.Axes:
    """Scatterplot of collective event duration.

    Arguments:
        total_size (str): name of total size column.
        duration (str): name of column with collective event duration.
        point_size (int): scatterplot point size.
        *args (Any): Arguments passed on to seaborn scatterplot function.
        **kwargs (Any): Keyword arguments passed on to seaborn scatterplot function.

    Returns:
        Axes (matplotlib.axes.Axes): Matplotlib Axes object of scatterplot
    """
    plot = sns.scatterplot(x=self.data[total_size], y=self.data[duration], s=point_size, *args, **kwargs)
    return plot
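
A minimal sketch, assuming statsPlots is exported from arcos4py.plotting; event_stats.csv and its 'total_size'/'duration' columns are hypothetical stand-ins for output of the stats module:

import matplotlib.pyplot as plt
import pandas as pd

from arcos4py.plotting import statsPlots

stats_df = pd.read_csv("event_stats.csv")  # hypothetical input file

sp = statsPlots(stats_df)
ax = sp.plot_events_duration("total_size", "duration", point_size=40)
plt.show()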

save_animation_frames(arcos_data, all_cells_data, output_dir, frame_col, collid_col, pos_cols, measurement_col=None, bin_col=None, plot_all_cells=True, color_all_cells_by_measurement=True, plot_bin_cells=True, plot_events=True, plot_convex_hulls=True, point_size=10.0, event_cmap=DEFAULT_EVENT_CMAP, event_alpha=0.9, hull_alpha=0.8, hull_linewidth_size_factor=1.0, bin_cell_color=DEFAULT_BIN_COLOR, bin_cell_alpha=0.7, bin_cell_marker_factor=0.8, all_cells_cmap=DEFAULT_ALL_CELLS_CMAP, all_cells_fixed_color=DEFAULT_ALL_CELLS_FIXED_COLOR, all_cells_alpha=0.5, all_cells_marker_size_factor=0.2, measurement_min_max=None, add_measurement_colorbar=True, filename_prefix='frame', dpi=150)

Generates and saves individual frames of a cell activity visualization as PNG images.

This function acts as a caller for the yield_animation_frames generator. It handles the iteration over frames, saving each frame to a file with appropriate naming and padding, and ensures figures are closed to free memory.

Parameters

arcos_data : pd.DataFrame
    DataFrame containing cell activity data, potentially including collective
    event IDs (collid_col) and binarization status (bin_col).
all_cells_data : pd.DataFrame
    DataFrame containing all cells (or a representative background set) for
    background plotting. Must include frame_col, pos_cols, and measurement_col
    if color_all_cells_by_measurement is True.
output_dir : str
    Directory where the output frames will be saved.
frame_col : str
    Name of the column indicating the time frame.
collid_col : str
    Name of the column indicating the collective event ID. Values > 0 are
    treated as events.
pos_cols : List[str]
    List of column names for spatial coordinates (e.g., ['x', 'y'] or ['x', 'y', 'z']).
measurement_col : Optional[str], optional
    Name of the column containing the measurement value. REQUIRED if
    color_all_cells_by_measurement is True. Used for coloring background cells.
    Default None.
bin_col : Optional[str], optional
    Name of the column indicating binarized activity (e.g., values > 0 mean
    binarized). Used for plot_bin_cells. Default None.
plot_all_cells : bool, optional
    Whether to plot the background cells from all_cells_data. Default True.
color_all_cells_by_measurement : bool, optional
    If True and plot_all_cells is True, color background cells using
    measurement_col and all_cells_cmap. Requires measurement_col to be valid
    in all_cells_data. If False or requirements not met, uses
    all_cells_fixed_color. Default True.
plot_bin_cells : bool, optional
    Whether to plot cells marked active by bin_col but not part of a
    collective event (collid_col <= 0). Requires bin_col to be set. Default True.
plot_events : bool, optional
    Whether to plot cells identified as part of collective events
    (collid_col > 0). Default True.
plot_convex_hulls : bool, optional
    Whether to draw convex hulls around collective events (2D only). Default True.
point_size : float, optional
    Base size for plotted points (event cells). Default 10.0.
event_cmap : str, optional
    Name of the Matplotlib colormap used to assign unique colors to different
    collective event IDs. Default 'tab20'.
event_alpha : float, optional
    Alpha transparency for event cells. Default 0.9.
hull_alpha : float, optional
    Alpha transparency for convex hull lines. Default 0.8.
hull_linewidth_size_factor : float, optional
    Size factor for convex hull line width. Default 1.0.
bin_cell_color : str, optional
    Color for binarized (non-event) cells. Default 'black'.
bin_cell_alpha : float, optional
    Alpha transparency for binarized (non-event) cells. Default 0.7.
bin_cell_marker_factor : float, optional
    Size multiplier for binarized (non-event) cells relative to point_size.
    Default 0.8.
all_cells_cmap : str, optional
    Name of the Matplotlib colormap used for background cells when
    color_all_cells_by_measurement is True. Default 'viridis'.
all_cells_fixed_color : str, optional
    Color for background cells if color_all_cells_by_measurement is False or
    requirements are not met. Default 'gray'.
all_cells_alpha : float, optional
    Alpha transparency for background cells. Default 0.5.
all_cells_marker_size_factor : float, optional
    Size multiplier for background cells relative to point_size. Default 0.2.
measurement_min_max : Optional[Tuple[float, float]], optional
    Manual min/max values for the measurement colormap normalization. If None,
    the range is determined from all_cells_data[measurement_col]. Default None.
add_measurement_colorbar : bool, optional
    If True and coloring all cells by measurement, add a static colorbar to
    the figure. Default True.
filename_prefix : str, optional
    Prefix for the output filenames. Default 'frame'.
dpi : int, optional
    DPI for the saved images. Default 150.

Source code in arcos4py/plotting/_plotting.py
def save_animation_frames(
    arcos_data: pd.DataFrame,
    all_cells_data: pd.DataFrame,
    output_dir: str,
    frame_col: str,
    collid_col: str,
    pos_cols: List[str],
    measurement_col: Optional[str] = None,
    bin_col: Optional[str] = None,
    plot_all_cells: bool = True,
    color_all_cells_by_measurement: bool = True,
    plot_bin_cells: bool = True,
    plot_events: bool = True,
    plot_convex_hulls: bool = True,
    point_size: float = 10.0,
    event_cmap: str = DEFAULT_EVENT_CMAP,
    event_alpha: float = 0.9,
    hull_alpha: float = 0.8,
    hull_linewidth_size_factor: float = 1.0,
    bin_cell_color: str = DEFAULT_BIN_COLOR,
    bin_cell_alpha: float = 0.7,
    bin_cell_marker_factor: float = 0.8,
    all_cells_cmap: str = DEFAULT_ALL_CELLS_CMAP,
    all_cells_fixed_color: str = DEFAULT_ALL_CELLS_FIXED_COLOR,
    all_cells_alpha: float = 0.5,
    all_cells_marker_size_factor: float = 0.2,
    measurement_min_max: Optional[Tuple[float, float]] = None,
    add_measurement_colorbar: bool = True,
    filename_prefix: str = "frame",
    dpi: int = 150,
) -> None:
    """Generates and saves individual frames of a cell activity visualization as PNG images.

    This function acts as a caller for the `yield_animation_frames` generator.
    It handles the iteration over frames, saving each frame to a file with
    appropriate naming and padding, and ensures figures are closed to free memory.

    Parameters
    ----------
    arcos_data : pd.DataFrame
        DataFrame containing cell activity data, potentially including collective
        event IDs (`collid_col`) and binarization status (`bin_col`).
    all_cells_data : pd.DataFrame
        DataFrame containing all cells (or a representative background set)
        for background plotting. Must include `frame_col`, `pos_cols`, and
        `measurement_col` if `color_all_cells_by_measurement` is True.
    output_dir : str
        Directory where the output frames will be saved.
    frame_col : str
        Name of the column indicating the time frame.
    collid_col : str
        Name of the column indicating the collective event ID.
        Values > 0 are treated as events.
    pos_cols : List[str]
        List of column names for spatial coordinates (e.g., ['x', 'y'] or ['x', 'y', 'z']).
    measurement_col : Optional[str], optional
        Name of the column containing the measurement value. REQUIRED if
        `color_all_cells_by_measurement` is True. Used for coloring background cells.
        Default None.
    bin_col : Optional[str], optional
        Name of the column indicating binarized activity (e.g., values > 0 mean
        binarized). Used for `plot_bin_cells`. Default None.
    plot_all_cells : bool, optional
        Whether to plot the background cells from `all_cells_data`. Default True.
    color_all_cells_by_measurement : bool, optional
        If True and `plot_all_cells` is True, color background cells using
        `measurement_col` and `all_cells_cmap`. Requires `measurement_col` to
        be valid in `all_cells_data`. If False or requirements not met, uses
        `all_cells_fixed_color`. Default True.
    plot_bin_cells : bool, optional
        Whether to plot cells marked active by `bin_col` but not part of a
        collective event (`collid_col` <= 0). Requires `bin_col` to be set.
        Default True.
    plot_events : bool, optional
        Whether to plot cells identified as part of collective events
        (`collid_col` > 0). Default True.
    plot_convex_hulls : bool, optional
        Whether to draw convex hulls around collective events (2D only).
        Default True.
    point_size : float, optional
        Base size for plotted points (event cells). Default 10.0.
    event_cmap : str, optional
        Name of the Matplotlib colormap used to assign unique colors to
        different collective event IDs. Default 'tab20'.
    event_alpha : float, optional
        Alpha transparency for event cells. Default 0.9.
    hull_alpha : float, optional
        Alpha transparency for convex hull lines. Default 0.8.
    hull_linewidth_size_factor : float, optional
        Size factor for convex hull line width. Default 1.0.
    bin_cell_color : str, optional
        Color for binarized (non-event) cells. Default 'black'.
    bin_cell_alpha : float, optional
        Alpha transparency for binarized (non-event) cells. Default 0.7.
    bin_cell_marker_factor : float, optional
        Size multiplier for binarized (non-event) cells relative to `point_size`.
        Default 0.8.
    all_cells_cmap : str, optional
        Name of the Matplotlib colormap used for background cells when
        `color_all_cells_by_measurement` is True. Default 'viridis'.
    all_cells_fixed_color : str, optional
        Color for background cells if `color_all_cells_by_measurement` is False
        or requirements are not met. Default 'gray'.
    all_cells_alpha : float, optional
        Alpha transparency for background cells. Default 0.5.
    all_cells_marker_size_factor : float, optional
        Size multiplier for background cells relative to `point_size`. Default 0.2.
    measurement_min_max : Optional[Tuple[float, float]], optional
        Manual min/max values for the measurement colormap normalization. If None,
        the range is determined from `all_cells_data[measurement_col]`. Default None.
    add_measurement_colorbar : bool, optional
        If True and coloring all cells by measurement, add a static colorbar
        to the figure. Default True.
    filename_prefix : str, optional
        Prefix for the output filenames. Default 'frame'.
    dpi : int, optional
        DPI for the saved images. Default 150.
    """
    # --- Setup Output Directory ---
    try:
        os.makedirs(output_dir, exist_ok=True)
        print(f"Saving animation frames to directory: {output_dir}")
    except OSError as e:
        print(f"Error creating output directory '{output_dir}': {e}")
        return  # Cannot proceed without output directory

    # --- Determine Frame Range and Padding (needed for filenames) ---
    # This duplicates a small part of the generator's logic, but is necessary
    # to format filenames correctly *before* the loop starts.
    min_frame_val = float("inf")
    max_frame_val = float("-inf")
    if not arcos_data.empty and frame_col in arcos_data:
        min_frame_val = 0
        max_frame_val = max(max_frame_val, arcos_data[frame_col].max())
    if not all_cells_data.empty and frame_col in all_cells_data:
        min_frame_val = 0
        max_frame_val = max(max_frame_val, all_cells_data[frame_col].max())

    if min_frame_val == float("inf") or max_frame_val == float("-inf"):
        # Generator will also warn, but we add a message here too.
        print("Could not determine frame range from input data. No frames will be saved.")
        return

    num_total_frames = int(max_frame_val) - int(min_frame_val) + 1
    padding_digits = (
        math.ceil(math.log10(max(1, int(max_frame_val)) + 1)) if max_frame_val >= 0 else 1
    )  # Calculate padding based on max frame number

    # --- Instantiate the Generator ---
    frame_generator = _yield_animation_frames(
        arcos_data=arcos_data,
        all_cells_data=all_cells_data,
        frame_col=frame_col,
        collid_col=collid_col,
        pos_cols=pos_cols,
        measurement_col=measurement_col,
        bin_col=bin_col,
        plot_all_cells=plot_all_cells,
        color_all_cells_by_measurement=color_all_cells_by_measurement,
        plot_bin_cells=plot_bin_cells,
        plot_events=plot_events,
        plot_convex_hulls=plot_convex_hulls,
        point_size=point_size,
        event_cmap=event_cmap,
        event_alpha=event_alpha,
        hull_alpha=hull_alpha,
        hull_linewidth_size_factor=hull_linewidth_size_factor,
        bin_cell_color=bin_cell_color,
        bin_cell_alpha=bin_cell_alpha,
        bin_cell_marker_factor=bin_cell_marker_factor,
        all_cells_cmap=all_cells_cmap,
        all_cells_fixed_color=all_cells_fixed_color,
        all_cells_alpha=all_cells_alpha,
        all_cells_marker_size_factor=all_cells_marker_size_factor,
        measurement_min_max=measurement_min_max,
        add_measurement_colorbar=add_measurement_colorbar,
    )

    # --- Iterate, Save, and Close ---
    saved_frame_count = 0
    print(f"Starting frame generation and saving (estimated {num_total_frames} frames)...")

    for fig in tqdm(frame_generator, desc="Saving frames", total=num_total_frames, unit="frame"):
        # Get frame number from the figure title (set by the generator)
        try:
            title = fig.axes[0].get_title()
            # Handle potential variations in title format slightly more robustly
            frame_num_str = title.split(':')[-1].strip()
            frame_num = int(frame_num_str)
        except (IndexError, ValueError, AttributeError) as e:
            warnings.warn(
                f"Could not reliably determine frame number from figure title ('{title}'). Using counter. Error: {e}"
            )
            # Fallback to a simple counter if title parsing fails
            frame_num = saved_frame_count + int(min_frame_val)  # Estimate frame num

        # Construct filename with padding
        frame_filename = f"{filename_prefix}_{frame_num:0{padding_digits}d}.png"
        output_path = os.path.join(output_dir, frame_filename)

        # Save the figure
        try:
            fig.savefig(output_path, dpi=dpi, bbox_inches='tight')
            saved_frame_count += 1
        except Exception as e:
            print(f"\nError saving frame {output_path}: {e}")
        finally:
            # CRITICAL: Close the figure to free memory, regardless of save success
            plt.close(fig)

    print(f"\nFinished saving {saved_frame_count} frames to {output_dir}.")
    if saved_frame_count == 0:
        print("Note: No frames were generated or saved. Check input data and parameters.")

tools

Tools for detecting collective events.

DataFrameTracker(linker, position_columns=['x'], frame_column='frame', obj_id_column=None, binarized_measurement_column=None, clid_column='clTrackID', **kwargs)

Bases: BaseTracker

Tracker class for data frames that works in conjunction with the Linker class.

Methods:

Name Description
track_iteration

Tracks events in a single frame.

track

Main method for tracking events through the dataframe. Yields the tracked dataframe for each iteration.

Parameters:

Name Type Description Default
linker Linker

The Linker object used for linking events.

required
position_columns list[str]

List of strings representing the coordinate columns.

['x']
frame_column str

String representing the frame/timepoint column in the dataframe.

'frame'
obj_id_column str | None

String representing the ID column, or None if not present. Defaults to None.

None
binarized_measurement_column str | None

String representing the binary measurement column, or None if not present. Defaults to None.

None
clid_column str

String representing the collision track ID column. Defaults to 'clTrackID'.

'clTrackID'
kwargs Any

Additional keyword arguments. Includes deprecated parameters for backwards compatibility. - coordinates_column: Deprecated parameter for position_columns. Use position_columns instead. - collid_column: Deprecated parameter, use clid_column instead. - id_column: Deprecated parameter, use obj_id_column instead. - bin_meas_column: Deprecated parameter, use binarized_measurement_column instead.

{}
Source code in arcos4py/tools/_detect_events.py
def __init__(
    self,
    linker: Linker,
    position_columns: list[str] = ['x'],
    frame_column: str = 'frame',
    obj_id_column: str | None = None,
    binarized_measurement_column: str | None = None,
    clid_column: str = 'clTrackID',
    **kwargs,
):
    """Initializes the DataFrameTracker object.

    Arguments:
        linker (Linker): The Linker object used for linking events.
        position_columns (list[str]): List of strings representing the coordinate columns.
        frame_column (str): String representing the frame/timepoint column in the dataframe.
        obj_id_column (str | None): String representing the ID column, or None if not present. Defaults to None.
        binarized_measurement_column (str | None): String representing the binary measurement column, or None if not present.
            Defaults to None.
        clid_column (str): String representing the collision track ID column. Defaults to 'clTrackID'.
        kwargs (Any): Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
            - coordinates_column: Deprecated parameter for position_columns. Use position_columns instead.
            - collid_column: Deprecated parameter, use clid_column instead.
            - id_column: Deprecated parameter, use obj_id_column instead.
            - bin_meas_column: Deprecated parameter, use binarized_measurement_column instead.
    """
    map_deprecated_params = {
        'coordinates_column': 'position_columns',
        'collid_column': 'clid_column',
        'id_column': 'obj_id_column',
        'bin_meas_column': 'binarized_measurement_column',
    }

    # check for allowed kwargs
    for key in kwargs:
        if key not in map_deprecated_params.keys():
            raise ValueError(f'Invalid keyword argument {key}')

    corrected_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assign parameters
    position_columns = corrected_kwargs.get('position_columns', position_columns)
    obj_id_column = corrected_kwargs.get('obj_id_column', obj_id_column)
    binarized_measurement_column = corrected_kwargs.get(
        'binarized_measurement_column', binarized_measurement_column
    )
    clid_column = corrected_kwargs.get('clid_column', clid_column)

    super().__init__(linker)
    self._coordinates_column = position_columns
    self._frame_column = frame_column
    self._id_column = obj_id_column
    self._binarized_measurement_column = binarized_measurement_column
    self._collid_column = clid_column
    self._validate_input(position_columns, frame_column, obj_id_column, binarized_measurement_column, clid_column)

track(x)

Main method for tracking events through the dataframe. Yields the tracked dataframe for each iteration.

Parameters:

Name Type Description Default
x DataFrame

Dataframe to track.

required

Yields:

Name Type Description
Generator Generator

Tracked dataframe.

Source code in arcos4py/tools/_detect_events.py
def track(self, x: pd.DataFrame) -> Generator:
    """Main method for tracking events through the dataframe. Yields the tracked dataframe for each iteration.

    Arguments:
        x (pd.DataFrame): Dataframe to track.

    Yields:
        Generator: Tracked dataframe.
    """
    if x.empty:
        raise ValueError('Input is empty')
    x_sorted = self._sort_input(x, frame_column=self._frame_column, object_id_column=self._id_column)

    for t in range(x_sorted[self._frame_column].min(), x_sorted[self._frame_column].max() + 1):
        x_frame = x_sorted.query(f'{self._frame_column} == {t}')
        x_tracked = self.track_iteration(x_frame)
        yield x_tracked
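
A minimal tracking-loop sketch, assuming Linker and DataFrameTracker are exported from arcos4py.tools; binarized.csv and the binary column name 'meas.bin' are hypothetical:

import pandas as pd

from arcos4py.tools import DataFrameTracker, Linker

df = pd.read_csv("binarized.csv")  # hypothetical: binarized "long format" data

linker = Linker(eps=2, min_clustersize=3, n_prev=1)
tracker = DataFrameTracker(
    linker,
    position_columns=["x", "y"],
    frame_column="frame",
    obj_id_column="obj_id",
    binarized_measurement_column="meas.bin",
    clid_column="clTrackID",
)
# track() yields one tracked dataframe per frame; collect them all.
tracked = pd.concat(tracker.track(df), ignore_index=True)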

track_iteration(x)

Tracks events in a single frame. Returns dataframe with event ids.

Parameters:

Name Type Description Default
x DataFrame

Dataframe to track.

required

Returns:

Type Description
DataFrame

pd.DataFrame: Dataframe with event ids.

Source code in arcos4py/tools/_detect_events.py
def track_iteration(self, x: pd.DataFrame) -> pd.DataFrame:
    """Tracks events in a single frame. Returns dataframe with event ids.

    Arguments:
        x (pd.DataFrame): Dataframe to track.

    Returns:
        pd.DataFrame: Dataframe with event ids.
    """
    x_filtered = self._filter_active(x, self._binarized_measurement_column)

    coordinates_data = self._select_necessary_columns(
        x_filtered,
        self._coordinates_column,
    )
    self.linker.link(coordinates_data)

    if self._collid_column in x.columns:
        df_out = x_filtered.drop(columns=[self._collid_column]).copy()
    else:
        df_out = x_filtered.copy()
    event_ids = self.linker.event_ids

    if not event_ids.size:
        df_out[self._collid_column] = 0
        return df_out

    df_out[self._collid_column] = self.linker.event_ids
    if any([self.linker._allow_merges, self.linker._allow_splits]):
        df_out = self.linker.lineage_tracker._add_parents_and_lineage_to_df(
            df_out,
            self._collid_column,
        )
    return df_out

ImageTracker(linker, downsample=1)

Bases: BaseTracker

Tracker class for image data that works in conjunction with the Linker class.

Methods:

Name Description
track_iteration

Tracks events in a single frame. Returns the tracked labels.

track

Main method for tracking events through the image series. Yields the tracked image for each iteration.

Parameters:

Name Type Description Default
linker Linker

The Linker object used for linking events.

required
downsample int

Downsampling factor for the images. Defaults to 1, meaning no downsampling.

1
Source code in arcos4py/tools/_detect_events.py
def __init__(self, linker: Linker, downsample: int = 1):
    """Initializes the ImageTracker object.

    Arguments:
        linker (Linker): The Linker object used for linking events.
        downsample (int): Downsampling factor for the images. Defaults to 1, meaning no downsampling.
    """
    super().__init__(linker)
    self._downsample = downsample

track(x, dims='TXY')

Method for tracking events through the image series. Yields the tracked image for each iteration.

Parameters:

Name Type Description Default
x ndarray

Image to track.

required
dims str

String of dimensions in order. Default is "TXY". Possible values are "T", "X", "Y", and "Z".

'TXY'

Returns:

Name Type Description
Generator Generator

Generator that yields the tracked image for each iteration.

Source code in arcos4py/tools/_detect_events.py
def track(self, x: np.ndarray, dims: str = "TXY") -> Generator:
    """Method for tracking events through the image series. Yields the tracked image for each iteration.

    Arguments:
        x (np.ndarray): Image to track.
        dims (str): String of dimensions in order. Default is "TXY". Possible values are "T", "X", "Y", and "Z".

    Returns:
        Generator: Generator that yields the tracked image for each iteration.
    """
    available_dims = ["T", "X", "Y", "Z"]
    dims_list = list(dims.upper())

    # check input
    for i in dims_list:
        if i not in available_dims:
            raise ValueError(f"Invalid dimension {i}. Must be 'T', 'X', 'Y', or 'Z'.")

    if len(dims_list) > len(set(dims_list)):
        raise ValueError("Duplicate dimensions in dims.")

    if len(dims_list) != x.ndim:
        raise ValueError(
            f"Length of dims must be equal to number of dimensions in image. Image has {x.ndim} dimensions."
        )

    dims_dict = {i: dims_list.index(i) for i in available_dims if i in dims_list}

    # reorder image so T is first dimension
    image_reshaped = np.moveaxis(x, dims_dict["T"], 0)

    for x_frame in image_reshaped:
        x_tracked = self.track_iteration(x_frame)
        yield x_tracked
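
A minimal sketch for image series, assuming ImageTracker and Linker are exported from arcos4py.tools; binary_stack.npy is a hypothetical (T, X, Y) array of binarized frames:

import numpy as np

from arcos4py.tools import ImageTracker, Linker

img = np.load("binary_stack.npy")  # hypothetical binarized image series

linker = Linker(eps=1.5, min_clustersize=5)
tracker = ImageTracker(linker, downsample=2)
# track() yields one labeled frame per timepoint; stack back into an array.
tracked = np.stack(list(tracker.track(img, dims="TXY")))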

track_iteration(x)

Tracks events in a single frame. Returns the tracked labels.

Parameters:

Name Type Description Default
x ndarray

Image to track.

required

Returns:

Type Description
ndarray

np.ndarray: Tracked labels.

Source code in arcos4py/tools/_detect_events.py
def track_iteration(self, x: np.ndarray) -> np.ndarray:
    """Tracks events in a single frame. Returns the tracked labels.

    Arguments:
        x (np.ndarray): Image to track.

    Returns:
        np.ndarray: Tracked labels.
    """
    x = downscale_image(x, self._downsample)
    coordinates_data, meas_data = self._image_to_coordinates(x)
    coordinates_data_filtered = self._filter_active(coordinates_data, meas_data)

    self.linker.link(coordinates_data_filtered)

    tracked_events = self.linker.event_ids
    out_img = self._coordinates_to_image(x, coordinates_data_filtered, tracked_events)

    if self._downsample > 1:
        out_img = upscale_image(out_img, self._downsample)

    return out_img

Linker(eps=1, eps_prev=None, min_clustersize=1, min_samples=None, clustering_method='dbscan', linking_method='nearest', predictor=True, n_prev=1, cost_threshold=0, reg=1, reg_m=10, n_jobs=1, allow_merges=False, allow_splits=False, stability_threshold=10, remove_small_clusters=False, min_size_for_split=1, **kwargs)

Linker class to link clusters across frames and detect collective events.

Attributes:

Name Type Description
event_ids ndarray

The event IDs.

frame_counter int

The current frame counter.

lineage_tracker LineageTracker

The LineageTracker object.

Methods:

Name Description
link

Links clusters across frames and detects collective events.

get_event_ids

Returns the event IDs.

Parameters:

Name Type Description Default
eps float

The maximum distance between two samples for one to be considered as in the neighbourhood of the other.

1
eps_prev float | None

Frame to frame distance, value is used to connect collective events across multiple frames. If "None", same value as eps is used.

None
min_clustersize int

The minimum size for a cluster to be identified as a collective event.

1
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clusteringMethod is 'hdbscan'. If None, minSamples = minClsz.

None
clustering_method str | Callable

The clustering method to be used. One of ['dbscan', 'hdbscan'] or a callable that takes a 2d array of coordinates and returns a list of cluster labels. Arguments eps, minClSz and minSamples are ignored if a callable is passed.

'dbscan'
linking_method str

The linking method to be used.

'nearest'
predictor bool | Callable

The predictor method to be used.

True
n_prev int

Number of previous frames the tracking algorithm looks back to connect collective events.

1
n_jobs int

Number of jobs to run in parallel (only for clustering algorithm).

1
cost_threshold int

Threshold for filtering low-probability matches (only for transportation linking).

0
reg float

Entropy regularization parameter for unbalanced OT algorithm (only for transportation linking).

1
reg_m float

Marginal relaxation parameter for unbalanced OT (only for transportation linking).

10
stability_threshold int

Number of consecutive frames a merge/split must persist to be considered stable.

10
allow_merges bool

Whether to allow merges.

False
allow_splits bool

Whether to allow splits.

False
remove_small_clusters bool

Whether to remove clusters smaller than min_clustersize.

False
min_size_for_split int

The minimum size for a cluster to be considered for splitting. Multiple of min_clustersize.

1
kwargs Any

Additional keyword arguments. Includes deprecated parameters for backwards compatibility. - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead. - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead. - minSamples: Deprecated parameter for min_samples. Use min_samples instead. - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead. - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead. - nPrev: Deprecated parameter for n_prev. Use n_prev instead. - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.

{}
Source code in arcos4py/tools/_detect_events.py
def __init__(
    self,
    eps: float = 1,
    eps_prev: float | None = None,
    min_clustersize: int = 1,
    min_samples: int | None = None,
    clustering_method: str | Callable = "dbscan",
    linking_method: str = "nearest",
    predictor: bool | Callable = True,
    n_prev: int = 1,
    cost_threshold: float = 0,
    reg: float = 1,
    reg_m: float = 10,
    n_jobs: int = 1,
    allow_merges: bool = False,
    allow_splits: bool = False,
    stability_threshold: int = 10,
    remove_small_clusters: bool = False,
    min_size_for_split: int = 1,
    **kwargs,
):
    """Initializes the Linker object.

    Arguments:
        eps (float): The maximum distance between two samples for one to be considered as in
            the neighbourhood of the other.
        eps_prev (float | None): Frame to frame distance, value is used to connect
            collective events across multiple frames. If "None", same value as eps is used.
        min_clustersize (int): The minimum size for a cluster to be identified as a collective event.
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clusteringMethod is 'hdbscan'. If None, minSamples =  minClsz.
        clustering_method (str | Callable): The clustering method to be used. One of ['dbscan', 'hdbscan']
            or a callable that takes a 2d array of coordinates and returns a list of cluster labels.
            Arguments `eps`, `minClSz` and `minSamples` are ignored if a callable is passed.
        linking_method (str): The linking method to be used.
        predictor (bool | Callable): The predictor method to be used.
        n_prev (int): Number of previous frames the tracking
            algorithm looks back to connect collective events.
        n_jobs (int): Number of jobs to run in parallel (only for clustering algorithm).
        cost_threshold (int): Threshold for filtering low-probability matches (only for transportation linking).
        reg (float): Entropy regularization parameter for unbalanced OT algorithm (only for transportation linking).
        reg_m (float): Marginal relaxation parameter for unbalanced OT (only for transportation linking).
        stability_threshold (int): Number of consecutive frames a merge/split must persist to be considered stable.
        allow_merges (bool): Whether to allow merges.
        allow_splits (bool): Whether to allow splits.
        remove_small_clusters (bool): Whether to remove clusters smaller than min_clustersize.
        min_size_for_split (int): The minimum size for a cluster to be considered for splitting. Multiple of min_clustersize.
        kwargs (Any): Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
            - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead.
            - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead.
            - minSamples: Deprecated parameter for min_samples. Use min_samples instead.
            - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead.
            - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead.
            - nPrev: Deprecated parameter for n_prev. Use n_prev instead.
            - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.
    """
    map_params = {
        'epsPrev': 'eps_prev',
        'minClSz': 'min_clustersize',
        'minSamples': 'min_samples',
        'clusteringMethod': 'clustering_method',
        'linkingMethod': 'linking_method',
        'nPrev': 'n_prev',
        'nJobs': 'n_jobs',
    }

    # check for allowed kwargs
    for key in kwargs:
        if key not in map_params.keys():
            raise ValueError(f'Invalid keyword argument {key}')

    # Handle deprecated parameters
    kwargs = handle_deprecated_params(map_params, **kwargs)

    # Assign parameters
    eps_prev = kwargs.get('eps_prev', eps_prev)
    min_clustersize = kwargs.get('min_clustersize', min_clustersize)
    min_samples = kwargs.get('min_samples', min_samples)
    clustering_method = kwargs.get('clustering_method', clustering_method)
    n_prev = kwargs.get('n_prev', n_prev)
    n_jobs = kwargs.get('n_jobs', n_jobs)

    self._predictor: Predictor | None  # for mypy
    self._memory = Memory(n_timepoints=n_prev)

    if callable(predictor):
        self._predictor = Predictor(predictor)
    elif predictor:
        self._predictor = Predictor.with_default_predictor()
    else:
        self._predictor = None

    self._nn_tree: KDTree | None = None
    if eps_prev is None:
        self._eps_prev = eps
    else:
        self._eps_prev = eps_prev

    self._reg = reg
    self._reg_m = reg_m
    self._cost_threshold = cost_threshold

    self._n_jobs = n_jobs
    self._validate_input(eps, eps_prev, min_clustersize, min_samples, clustering_method, n_prev, n_jobs)

    self.event_ids = np.empty((0, 0), dtype=np.int64)

    if hasattr(clustering_method, '__call__'):  # check if it's callable
        self.clustering_function = clustering_method
    else:
        if clustering_method == "dbscan":
            self.clustering_function = functools.partial(_dbscan, eps=eps, minClSz=min_clustersize)
        elif clustering_method == "hdbscan":
            self.clustering_function = functools.partial(
                _hdbscan, eps=eps, minClSz=min_clustersize, min_samples=min_samples, cluster_selection_method='eom'
            )
        else:
            raise ValueError(
                f'Clustering method must be either in {AVAILABLE_CLUSTERING_METHODS} or a callable that takes data as its only argument'  # noqa E501
            )

    if callable(linking_method):
        self.linking_function = linking_method
    else:
        if linking_method == "nearest":
            self.linking_function = 'brute_force_linking'
        elif linking_method == "transportation":
            self.linking_function = 'transportation_linking'
        else:
            raise ValueError(
                f'Linking method must be one of {AVAILABLE_LINKING_METHODS} or a callable'  # noqa E501
            )

    self._stability_threshold = stability_threshold
    self._allow_merges = allow_merges
    self._allow_splits = allow_splits
    self._merge_candidate_history: Dict[int, List[Tuple[List[int], int]]] = {}
    self._split_candidate_history: Dict[int, List[Tuple[int, List[int]]]] = {}
    self.lineage_tracker = LineageTracker()
    self.frame_counter = -1  # Start at -1 to get the first frame to be 0
    self._remove_small_clusters = remove_small_clusters
    self._min_clustersize = min_clustersize
    self._min_size_for_split = min_size_for_split

link(input_coordinates)

Links clusters across frames and detects collective events.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_coordinates | ndarray | The input coordinates. | required |

Source code in arcos4py/tools/_detect_events.py
def link(self, input_coordinates: np.ndarray) -> None:
    """Links clusters across frames and detects collective events.

    Arguments:
        input_coordinates (np.ndarray): The input coordinates.
    """
    self.frame_counter += 1
    original_cluster_ids, coordinates, nanrows = self._clustering(input_coordinates)

    if not len(self._memory.prev_cluster_ids):
        linked_cluster_ids = self._update_id_empty(original_cluster_ids)
    elif original_cluster_ids.size == 0 or self._memory.all_cluster_ids.size == 0:
        linked_cluster_ids = self._update_id_empty(original_cluster_ids)
    else:
        linked_cluster_ids = self._update_id(original_cluster_ids, coordinates)

    # Optionally drop clusters below min_clustersize before resolving merges/splits
    final_cluster_ids = linked_cluster_ids
    if self._remove_small_clusters:
        final_cluster_ids = self._apply_remove_small_clusters(final_cluster_ids, original_cluster_ids)

    # Apply stable merges and splits
    final_cluster_ids = self._apply_stable_merges_splits(final_cluster_ids, original_cluster_ids)

    # Update lineage graph
    self.lineage_tracker._add_frame(linked_cluster_ids, final_cluster_ids, self.frame_counter)

    # Update memory and fit predictor
    self._memory.add_frame(new_coordinates=coordinates, new_cluster_ids=final_cluster_ids)
    if self._predictor is not None and len(self._memory.coordinates) > 1:
        self._predictor.fit(coordinates=self._memory.coordinates, cluster_ids=self._memory.prev_cluster_ids)
    self._memory.remove_timepoint()

    event_ids = np.full_like(nanrows, -1, dtype=np.int64)
    event_ids[~nanrows] = final_cluster_ids

    self.event_ids = event_ids
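
A minimal usage sketch of the per-frame linking loop. The class name Linker, its import path, and the constructor arguments are assumptions made for illustration; the toy coordinates are not from the package:

```python
import numpy as np

from arcos4py.tools._detect_events import Linker  # class name and import path assumed

# Construct the tracker documented above (parameter names taken from the __init__ shown).
linker = Linker(eps=0.5, min_clustersize=2)

# Two frames of toy 2D coordinates: a tight pair of points plus one distant point.
frames = [
    np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.0]]),
    np.array([[0.05, 0.1], [0.15, 0.25], [5.1, 5.0]]),
]

for coords in frames:
    linker.link(coords)      # cluster the frame and link it to previous frames
    print(linker.event_ids)  # per-point event ids; -1 marks rows with NaN coordinates
```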

binData(smooth_k=3, bias_k=51, peak_threshold=0.2, binarization_threshold=0.1, polynomial_degree=1, bias_method='runmed', n_jobs=1, **kwargs)

Bases: detrender

Smooth, de-trend, and binarise the input data.

First, a short-term median filter of size smooth_k is applied to remove fast noise from the time series. If the de-trending method is set to "none", smoothing is applied to the globally rescaled time series. The subsequent de-trending can be performed either with a long-term median filter of size bias_k (bias_method = "runmed") or by fitting a polynomial of degree polynomial_degree (bias_method = "lm").

After de-trending, the signal is rescaled to the (0,1) range whenever the global difference between its minimum and maximum exceeds peak_threshold. The final signal is binarized using the binarization_threshold threshold.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| smoothK | int | Size of the short-term median smoothing filter. |
| biasK | int | Size of the long-term de-trending median filter. |
| peakThr | float | Threshold for rescaling of the de-trended signal. |
| binThr | float | Threshold for binarizing the de-trended signal. |
| polyDeg | int | Sets the degree of the polynomial for lm fitting. |
| biasMet | str | De-trending method, one of ['runmed', 'lm', 'none']. |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| smooth_k | int | Size of the short-term median smoothing filter. | 3 |
| bias_k | int | Size of the long-term de-trending median filter. | 51 |
| peak_threshold | float | Threshold for rescaling of the de-trended signal. | 0.2 |
| binarization_threshold | float | Threshold for binarizing the de-trended signal. | 0.1 |
| polynomial_degree | int | Sets the degree of the polynomial for lm fitting. | 1 |
| bias_method | str | De-trending method, one of ['runmed', 'lm', 'none']. | 'runmed' |
| n_jobs | int | Number of jobs to run in parallel. | 1 |

Source code in arcos4py/tools/_binarize_detrend.py
def __init__(
    self,
    smooth_k: int = 3,
    bias_k: int = 51,
    peak_threshold: float = 0.2,
    binarization_threshold: float = 0.1,
    polynomial_degree: int = 1,
    bias_method: str = "runmed",
    n_jobs: int = 1,
    **kwargs,
) -> None:
    """Smooth, de-trend, and binarise the input data.

    Arguments:
        smooth_k (int): Size of the short-term median smoothing filter.
        bias_k (int): Size of the long-term de-trending median filter.
        peak_threshold (float): Threshold for rescaling of the de-trended signal.
        binarization_threshold (float): Threshold for binarizing the de-trended signal.
        polynomial_degree (int): Sets the degree of the polynomial for lm fitting.
        bias_method (str): De-trending method, one of ['runmed', 'lm', 'none'].
        n_jobs (int): Number of jobs to run in parallel.
    """
    super().__init__(smooth_k, bias_k, peak_threshold, polynomial_degree, bias_method, n_jobs, **kwargs)
    self.binarization_threshold = binarization_threshold

run(x, group_column, measurement_column, frame_column, **kwargs)

Runs binarization and detrending.

If the bias_method is 'none', the data is first rescaled to the [0, 1] range, then local smoothing is applied to the measurement by groups, followed by binarization.

If bias_method is one of ['lm', 'runmed'], the data is first detrended locally with a median filter and then globally, with a linear model for 'lm' or a median filter for 'runmed', followed by binarization.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| x | DataFrame | The time-series data for smoothing, detrending and binarization. | required |
| group_column | str \| None | Object id column in x. Detrending and rescaling are performed on a per-object basis. If None, no detrending is performed, only rescaling, and the bias method is ignored. | required |
| measurement_column | str | Measurement column in x on which detrending and rescaling are performed. | required |
| frame_column | str | Frame column in the time-series data. Used for sorting. | required |
| **kwargs | Any | Additional keyword arguments. Includes old parameters for backwards compatibility: GroupCol (str), colMeas (str), colFrame (str). | {} |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | Dataframe containing binarized data, rescaled data and the original columns. |

Source code in arcos4py/tools/_binarize_detrend.py
def run(
    self, x: pd.DataFrame, group_column: str | None, measurement_column: str, frame_column: str, **kwargs
) -> pd.DataFrame:
    """Runs binarization and detrending.

    If the bias_method is 'none', the data is first rescaled to the [0, 1] range, then
    local smoothing is applied to the measurement by groups, followed by
    binarization.

    If bias_method is one of ['lm', 'runmed'], the data is first detrended locally with a
    median filter and then globally, with a linear model for 'lm' or a median filter
    for 'runmed', followed by binarization.

    Arguments:
        x (DataFrame): The time-series data for smoothing, detrending and binarization.
        group_column (str | None): Object id column in x. Detrending and rescaling is performed on a per-object basis.
            If None, no detrending is performed, only rescaling and bias method is ignored.
        measurement_column (str): Measurement column in x on which detrending and rescaling is performed.
        frame_column (str): Frame column in Time-series data. Used for sorting.
        **kwargs (Any): Additional keyword arguments. Includes old parameters for backwards compatibility.
            - GroupCol (str): Object id column in x. Detrending and rescaling is performed on a per-object basis.
            - colMeas (str): Measurement column in x on which detrending and rescaling is performed.
            - colFrame (str): Frame column in Time-series data. Used for sorting.

    Returns:
        DataFrame: Dataframe containing binarized data, rescaled data and the original columns.
    """
    # handle deprecated parameters
    param_mapping = {
        "GroupCol": "group_column",
        "colMeas": "measurement_column",
        "colFrame": "frame_column",
    }
    # allowed_kwargs
    allowed_kwargs = param_mapping.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Invalid keyword argument: {key}")
    updated_kwargs = handle_deprecated_params(param_mapping, **kwargs)

    # update the parameters
    group_column = updated_kwargs.get("group_column", group_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)
    frame_column = updated_kwargs.get("frame_column", frame_column)

    if group_column is None:
        return self._run_without_groupcol(x, measurement_column, frame_column)
    else:
        return self._run_with_groupcol(x, group_column, measurement_column, frame_column)
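
A minimal usage sketch of binData (the import path, column names, and values below are illustrative assumptions, not taken from the package's examples):

```python
import pandas as pd

from arcos4py.tools import binData  # import path assumed

# Toy long-format time series: two objects measured over five frames.
df = pd.DataFrame({
    'id': [1] * 5 + [2] * 5,
    'time': list(range(5)) * 2,
    'meas': [0.1, 0.2, 0.9, 0.8, 0.1, 0.0, 0.1, 0.1, 0.7, 0.9],
})

binarizer = binData(smooth_k=3, bias_k=3, bias_method='runmed', binarization_threshold=0.5)
binarized = binarizer.run(df, group_column='id', measurement_column='meas', frame_column='time')
print(binarized.head())
```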

calcCollevStats()

Class to calculate statistics of collective events.

Source code in arcos4py/tools/_stats.py
def __init__(self) -> None:
    """Initialize the class."""
    warnings.warn(
        "The 'calcCollevStats' class is deprecated and will be removed in a future version. "
        "Please use the standalone functions instead (calculate_statistics).",
        DeprecationWarning,
    )

calculate(data, frame_column, collid_column, obj_id_column, posCol=None)

Calculate summary statistics for collective events based on the entire duration of each event.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | DataFrame | Input data containing information on the collective events. | required |
| frame_column | str | The column name representing the frame numbers. | required |
| collid_column | str | The column name representing the collective event IDs. | required |
| obj_id_column | str | The column name representing the object IDs. Defaults to None. | required |
| posCol | list | List of column names representing the position coordinates. Defaults to None. | None |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | A DataFrame containing the summary statistics of the collective events. |

Deprecated: Use calculate_statistics instead.

Source code in arcos4py/tools/_stats.py
def calculate(
    self,
    data: pd.DataFrame,
    frame_column: str,
    collid_column: str,
    obj_id_column: Union[str, None],
    posCol: Union[list, None] = None,
) -> pd.DataFrame:
    """Calculate summary statistics for collective events based on the entire duration of each event.

    Arguments:
        data (pd.DataFrame): Input data containing information on the collective events.
        frame_column (str): The column name representing the frame numbers.
        collid_column (str): The column name representing the collective event IDs.
        obj_id_column (str, optional): The column name representing the object IDs. Defaults to None.
        posCol (list, optional): List of column names representing the position coordinates. Defaults to None.

    Returns:
        pd.DataFrame: A DataFrame containing the summary statistics of the collective events.

    Deprecated:
        calculate: Use calculate_statistics instead.
    """
    warnings.warn(
        "The 'calculate' method is deprecated and will be removed in a future version. "
        "Please use the 'calculate_statistics' function instead.",
        DeprecationWarning,
    )
    return calculate_statistics(data, frame_column, collid_column, obj_id_column, posCol)

clipMeas(data)

Clip input array.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | ndarray | Array to be clipped. | required |

Source code in arcos4py/tools/_cleandata.py
def __init__(self, data: np.ndarray) -> None:
    """Clips array to quantilles.

    Arguments:
        data (ndarray): To be clipped.
    """
    self.data = data

clip(clip_low=0.001, clip_high=0.999)

Clip input array to upper and lower quantiles defined in clip_low and clip_high.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| clip_low | float | Lower clipping boundary (quantile). | 0.001 |
| clip_high | float | Upper clipping boundary (quantile). | 0.999 |

Returns:

| Type | Description |
| --- | --- |
| np.ndarray | A clipped array of the input data. |

Source code in arcos4py/tools/_cleandata.py
def clip(self, clip_low: float = 0.001, clip_high: float = 0.999) -> np.ndarray:
    """Clip input array to upper and lower quantiles defined in clip_low and clip_high.

    Arguments:
        clip_low (float): Lower clipping boundary (quantile).
        clip_high (float): Upper clipping boundary (quantile).

    Returns:
        np.ndarray: A clipped array of the input data.
    """
    low, high = self._calculate_percentile(self.data, clip_low, clip_high)
    out = self.data.clip(low, high)
    return out
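
A minimal usage sketch (import path assumed; the data is synthetic):

```python
import numpy as np

from arcos4py.tools import clipMeas  # import path assumed

rng = np.random.default_rng(42)
values = rng.normal(size=1000)
values[0] = 100.0  # inject an outlier

clipped = clipMeas(values).clip(clip_low=0.001, clip_high=0.999)
print(values.max(), clipped.max())  # the outlier is clipped to the 99.9th percentile
```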

detectCollev(input_data, eps=1, epsPrev=None, minClSz=1, nPrev=1, posCols=['x'], frame_column='time', id_column=None, bin_meas_column='meas', clid_column='clTrackID', dims='TXY', method='dbscan', min_samples=None, linkingMethod='nearest', n_jobs=1, predictor=False, show_progress=True)

Class to detect collective events.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| input_data | Union[DataFrame, ndarray] | The input data to track. |
| eps | float | Maximum distance for clustering. Default is 1. |
| epsPrev | Union[float, None] | Maximum distance for linking previous clusters; if None, eps is used. Default is None. |
| minClSz | int | Minimum cluster size. Default is 1. |
| nPrev | int | Number of previous frames to consider. Default is 1. |
| posCols | list | List of column names for the position columns. Default is ["x"]. |
| frame_column | str | Name of the column containing the frame number. Default is 'time'. |
| id_column | Union[str, None] | Name of the column containing the id. Default is None. |
| bin_meas_column | Union[str, None] | Name of the column containing the binary measurement. Default is 'meas'. |
| clid_column | str | Name of the column containing the cluster id. Default is 'clTrackID'. |
| dims | str | String of dimensions in order. Default is "TXY". Possible values are "T", "X", "Y", "Z". |
| method | str | The method used for clustering, one of ['dbscan', 'hdbscan']. Default is "dbscan". |
| min_samples | int \| None | The number of samples (or total weight) in a neighbourhood for a point to be considered a core point, including the point itself. Only used if the clustering method is 'hdbscan'. If None, min_samples = minClSz. |
| linkingMethod | str | The method used for linking. Default is 'nearest'. |
| n_jobs | int | Number of jobs to run in parallel. Default is 1. |
| predictor | bool \| Callable | Whether or not to use a predictor. Default is False. True uses the default predictor; a callable can be passed to use a custom predictor. See the default predictor method for details. |
| show_progress | bool | Whether or not to show a progress bar. Default is True. |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_data | DataFrame | Input data to be processed. Must contain a binarized measurement column. | required |
| eps | float | The maximum distance between two samples for one to be considered as in the neighbourhood of the other. This is not a maximum bound on the distances of points within a cluster. | 1 |
| epsPrev | float \| None | Frame-to-frame distance, used to connect collective events across multiple frames. If None, the same value as eps is used. | None |
| minClSz | int | Minimum size for a cluster to be identified as a collective event. | 1 |
| nPrev | int | Number of previous frames the tracking algorithm looks back to connect collective events. | 1 |
| posCols | list | List of position columns contained in the data. Must contain at least one. | ['x'] |
| frame_column | str | Indicating the frame column in input_data. | 'time' |
| id_column | str \| None | Indicating the track id/id column in input_data, optional. | None |
| bin_meas_column | str | Indicating the bin_meas_column in input_data, or None. | 'meas' |
| clid_column | str | Indicating the column name containing the ids of collective events. | 'clTrackID' |
| dims | str | String of dimensions in order, used if input_data is a numpy array. Possible values are "T", "X", "Y", "Z". | 'TXY' |
| method | str | The method used for clustering, one of ['dbscan', 'hdbscan']. | 'dbscan' |
| min_samples | int \| None | The number of samples (or total weight) in a neighbourhood for a point to be considered a core point, including the point itself. Only used if the clustering method is 'hdbscan'. If None, min_samples = minClSz. | None |
| linkingMethod | str | The method used for linking. | 'nearest' |
| n_jobs | int | Number of parallel workers to spawn, -1 uses all available cpus. | 1 |
| predictor | bool \| Callable | Whether or not to use a predictor. True uses the default predictor; a callable can be passed to use a custom predictor. See the default predictor method for details. | False |
| show_progress | bool | Whether or not to show a progress bar. | True |

Source code in arcos4py/tools/_detect_events.py
def __init__(
    self,
    input_data: Union[pd.DataFrame, np.ndarray],
    eps: float = 1,
    epsPrev: Union[float, None] = None,
    minClSz: int = 1,
    nPrev: int = 1,
    posCols: list = ["x"],
    frame_column: str = 'time',
    id_column: Union[str, None] = None,
    bin_meas_column: Union[str, None] = 'meas',
    clid_column: str = 'clTrackID',
    dims: str = "TXY",
    method: str = "dbscan",
    min_samples: int | None = None,
    linkingMethod='nearest',
    n_jobs: int = 1,
    predictor: bool | Callable = False,
    show_progress: bool = True,
) -> None:
    """Constructs class with input parameters.

    Arguments:
        input_data (DataFrame): Input data to be processed. Must contain a binarized measurement column.
        eps (float): The maximum distance between two samples for one to be considered as in
            the neighbourhood of the other.
            This is not a maximum bound on the distances of points within a cluster.
        epsPrev (float | None): Frame to frame distance, value is used to connect
            collective events across multiple frames.If "None", same value as eps is used.
        minClSz (int): Minimum size for a cluster to be identified as a collective event.
        nPrev (int): Number of previous frames the tracking
            algorithm looks back to connect collective events.
        posCols (list): List of position columns contained in the data.
            Must at least contain one.
        frame_column (str): Indicating the frame column in input_data.
        id_column (str | None): Indicating the track id/id column in input_data, optional.
        bin_meas_column (str): Indicating the bin_meas_column in input_data or None.
        clid_column (str): Indicating the column name containing the ids of collective events.
        dims (str): String of dimensions in order, used if input_data is a numpy array. Default is "TXY".
            Possible values are "T", "X", "Y", "Z".
        method (str): The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if the clustering method is 'hdbscan'. If None, min_samples = minClSz.
        linkingMethod (str): The method used for linking. Default is 'nearest'.
        n_jobs (int): Number of parallel workers to spawn, -1 uses all available cpus.
        predictor (bool | Callable): Whether or not to use a predictor. Default is False.
            True uses the default predictor. A callable can be passed to use a custom predictor.
            See default predictor method for details.
        show_progress (bool): Whether or not to show progress bar. Default is True.
    """
    self.input_data = input_data
    self.eps = eps
    self.epsPrev = epsPrev
    self.minClSz = minClSz
    self.nPrev = nPrev
    self.posCols = posCols
    self.frame_column = frame_column
    self.id_column = id_column
    self.bin_meas_column = bin_meas_column
    self.clid_column = clid_column
    self.dims = dims
    self.method = method
    self.linkingMethod = linkingMethod
    self.min_samples = min_samples
    self.predictor = predictor
    self.n_jobs = n_jobs
    self.show_progress = show_progress
    warnings.warn(
        "This class is deprecated and will be removed a future release, use the track_events_dataframe or track_events_image functions directly.",  # noqa: E501
        DeprecationWarning,
    )

run(copy=True)

Runs the collective event detection algorithm.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| copy | bool | Whether or not to copy the input data. | True |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | Input data with added collective event ids. |

Source code in arcos4py/tools/_detect_events.py
def run(self, copy: bool = True) -> pd.DataFrame:
    """Runs the collective event detection algorithm.

    Arguments:
        copy (bool): Whether or not to copy the input data. Default is True.

    Returns:
        DataFrame: Input data with added collective event ids.
    """
    if isinstance(self.input_data, pd.DataFrame):
        if copy:
            self.input_data = self.input_data.copy()
        return track_events_dataframe(
            X=self.input_data,
            position_columns=self.posCols,
            frame_column=self.frame_column,
            id_column=self.id_column,
            binarized_measurement_column=self.bin_meas_column,
            clid_column=self.clid_column,
            eps=self.eps,
            eps_prev=self.epsPrev,
            min_clustersize=self.minClSz,
            min_samples=self.min_samples,
            clustering_method=self.method,
            linking_method=self.linkingMethod,
            n_prev=self.nPrev,
            predictor=self.predictor,
            n_jobs=self.n_jobs,
            show_progress=self.show_progress,
        )
    elif isinstance(self.input_data, np.ndarray):
        if copy:
            self.input_data = np.copy(self.input_data)
        return track_events_image(
            X=self.input_data,
            eps=self.eps,
            eps_prev=self.epsPrev,
            min_clustersize=self.minClSz,
            min_samples=self.min_samples,
            clustering_method=self.method,
            n_prev=self.nPrev,
            predictor=self.predictor,
            linking_method=self.linkingMethod,
            dims=self.dims,
            n_jobs=self.n_jobs,
            show_progress=self.show_progress,
        )
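
A minimal usage sketch. Note that the class itself is deprecated in favour of track_events_dataframe; the import path and column values below are illustrative assumptions:

```python
import pandas as pd

from arcos4py.tools import detectCollev  # import path assumed

# Toy binarized input: three neighbouring objects active over two frames.
df = pd.DataFrame({
    'time': [0, 0, 0, 1, 1, 1],
    'x': [0.0, 0.5, 1.0, 0.1, 0.6, 1.1],
    'meas': [1, 1, 1, 1, 1, 1],  # already-binarized measurement
})

events = detectCollev(df, eps=1, minClSz=3, posCols=['x'], bin_meas_column='meas').run()
print(events)  # input data with an added clTrackID column
```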

filterCollev(data, frame_column='time', clid_column='collid', obj_id_column='trackID', **kwargs)

Select collective events that last longer than min_duration and have a larger total size than min_total_size.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| data | DataFrame | Dataframe with detected collective events. |
| frame_column | str | Indicating the frame column in data. |
| clid_column | str | Indicating the collective event id column in data. |
| obj_id_column | str | Indicating the object identifier column such as cell track id. |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | DataFrame | Dataframe with detected collective events. | required |
| frame_column | str | Indicating the frame column in data. | 'time' |
| clid_column | str | Indicating the collective event id column in data. | 'collid' |
| obj_id_column | str | Indicating the object identifier column such as cell track id. | 'trackID' |
| **kwargs | Any | Additional keyword arguments. Includes deprecated parameters: collid_column (str), deprecated, use clid_column instead. | {} |

Source code in arcos4py/tools/_filter_events.py
def __init__(
    self,
    data: pd.DataFrame,
    frame_column: str = "time",
    clid_column: str = "collid",
    obj_id_column: str = "trackID",
    **kwargs,
):
    """Constructs filterCollev class with Parameters.

    Arguments:
        data (Dataframe): With detected collective events.
        frame_column (str): Indicating the frame column in data.
        clid_column (str): Indicating the collective event id column in data.
        obj_id_column (str): Indicating the object identifier column such as cell track id.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - collid_column (str): Deprecated. Use clid_column instead.
    """
    map_deprecated_params = {
        "collid_column": "clid_column",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    clid_column = updated_kwargs.get("clid_column", clid_column)

    self.data = data
    self.frame_column = frame_column
    self.clid_column = clid_column
    self.obj_id_column = obj_id_column

filter(min_duration=9, min_total_size=10, **kwargs)

Filter collective events.

Method to filter collective events according to the parameters specified in the object instance.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| min_duration | int | Minimal duration of collective events to be selected. | 9 |
| min_total_size | int | Minimal total size of collective events to be selected. | 10 |
| **kwargs | Any | Additional keyword arguments. Includes deprecated parameters: coll_duration (int), use min_duration instead; coll_total_size (int), use min_total_size instead. | {} |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | Pandas dataframe containing the filtered collective events. |

Source code in arcos4py/tools/_filter_events.py
def filter(self, min_duration: int = 9, min_total_size: int = 10, **kwargs) -> pd.DataFrame:
    """Filter collective events.

    Method to filter collective events according to the
    parameters specified in the object instance.

    Arguments:
        min_duration (int): Minimal duration of collective events to be selected.
        min_total_size (int): Minimal total size of collective events to be selected.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - coll_duration (int): Deprecated. Use min_duration instead.
            - coll_total_size (int): Deprecated. Use min_total_size instead.

    Returns:
        DataFrame: Pandas dataframe containing the filtered collective events.
    """
    map_deprecated_params = {
        "coll_duration": "min_duration",
        "coll_total_size": "min_total_size",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    min_duration = updated_kwargs.get("min_duration", min_duration)
    min_total_size = updated_kwargs.get("min_total_size", min_total_size)

    if self.data.empty:
        return self.data
    stats = calcCollevStats()
    stats_df = stats.calculate(self.data, self.frame_column, self.clid_column, self.obj_id_column)

    filtered_df = self._filter_collev(
        data=self.data,
        clid_stats=stats_df,
        clid_column=self.clid_column,
        min_duration=min_duration,
        min_total_size=min_total_size,
    )
    return filtered_df
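
A minimal usage sketch (import path and toy data are illustrative assumptions):

```python
import pandas as pd

from arcos4py.tools import filterCollev  # import path assumed

# Toy tracked events: event 1 spans 10 frames, event 2 only 2.
df = pd.DataFrame({
    'time': list(range(10)) + [0, 1],
    'trackID': [1] * 10 + [2, 2],
    'collid': [1] * 10 + [2, 2],
})

flt = filterCollev(df, frame_column='time', clid_column='collid', obj_id_column='trackID')
filtered = flt.filter(min_duration=5, min_total_size=1)
print(filtered['collid'].unique())  # event 2 is dropped for being too short
```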

interpolation(data)

Interpolate NaN values in a pandas DataFrame.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| data | DataFrame | Data in which NaN values should be replaced with interpolated values. |

Uses pandas.interpolate with linear interpolation.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | DataFrame | Data in which NaN values should be replaced with interpolated values. | required |

Source code in arcos4py/tools/_cleandata.py
def __init__(self, data: pd.DataFrame):
    """Interpolate nan values in a pandas dataframe.

    Uses pandas.interpolate with linear interpolation.

    Arguments:
        data (DataFrame): Where NaN should be replaced with interpolated values.
    """
    self.data = data

interpolate()

Interpolate nan and missing values.

Returns:

| Type | Description |
| --- | --- |
| DataFrame | Interpolated input data. |

Source code in arcos4py/tools/_cleandata.py
def interpolate(self) -> pd.DataFrame:
    """Interpolate nan and missing values.

    Returns:
        DataFrame: Interpolated input data.
    """
    self.data = self.data.interpolate(axis=0)

    return self.data
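
A minimal usage sketch (import path assumed):

```python
import numpy as np
import pandas as pd

from arcos4py.tools import interpolation  # import path assumed

df = pd.DataFrame({'meas': [1.0, np.nan, 3.0, np.nan, 5.0]})
print(interpolation(df).interpolate())  # NaNs replaced by linearly interpolated values
```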

calculate_statistics(data, frame_column='frame', clid_column='collid', obj_id_column=None, position_columns=None, **kwargs)

Calculate summary statistics for collective events based on the entire duration of each event.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | DataFrame | Input data containing information on the collective events. | required |
| frame_column | str | The column name representing the frame numbers. | 'frame' |
| clid_column | str | The column name representing the collective event IDs. | 'collid' |
| obj_id_column | str | The column name representing the object IDs. Defaults to None. | None |
| position_columns | List[str] | List of column names representing the position coordinates. Defaults to None. | None |
| **kwargs | Any | Additional keyword arguments. Includes deprecated parameters: collid_column (str), use clid_column instead; pos_columns (List[str], optional), use position_columns instead. | {} |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | A DataFrame containing the summary statistics of the collective events. |

Statistics Calculated:

- collid: The unique ID representing each collective event.
- duration: The duration of each event, calculated as the difference between the maximum and minimum frame values plus one.
- first_timepoint, last_timepoint: The first and last frames in which each event occurs.
- total_size: The total number of unique objects involved in each event (calculated if obj_id_column is provided).
- min_size, max_size: The minimum and maximum size of each event, defined as the number of objects in the event's smallest and largest frames, respectively.
- first_frame_centroid_x, first_frame_centroid_y, last_frame_centroid_x, last_frame_centroid_y: The x and y coordinates of the centroid of all objects in the first and last frames of each event (calculated if position_columns is provided).
- centroid_speed: The speed of the centroid, calculated as the distance between the first and last frame centroids divided by the duration (calculated if position_columns is provided).
- direction: The direction of motion of the centroid, calculated as the arctangent of the change in y divided by the change in x (calculated if position_columns is provided).
- first_frame_spatial_extent, last_frame_spatial_extent: The maximum distance between any pair of objects in the first and last frames (calculated if position_columns is provided).
- first_frame_convex_hull_area, last_frame_convex_hull_area: The areas of the convex hulls enclosing all objects in the first and last frames (calculated if position_columns is provided).
- size_variability: The standard deviation of the event size over all frames, providing a measure of the variability in the size of the event over time (calculated if obj_id_column is provided).

Source code in arcos4py/tools/_stats.py
def calculate_statistics(
    data: pd.DataFrame,
    frame_column: str = "frame",
    clid_column: str = "collid",
    obj_id_column: Union[str, None] = None,
    position_columns: Union[List[str], None] = None,
    **kwargs,
) -> pd.DataFrame:
    """Calculate summary statistics for collective events based on the entire duration of each event.

    Arguments:
        data (pd.DataFrame): Input data containing information on the collective events.
        frame_column (str): The column name representing the frame numbers.
        clid_column (str): The column name representing the collective event IDs.
        obj_id_column (str, optional): The column name representing the object IDs. Defaults to None.
        position_columns (List[str], optional): List of column names representing the position coordinates. Defaults to None.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - collid_column (str): Deprecated. Use clid_column instead.
            - pos_columns (List[str], optional): Deprecated. Use position_columns instead.

    Returns:
        pd.DataFrame: A DataFrame containing the summary statistics of the collective events.

    Statistics Calculated:
        - collid: The unique ID representing each collective event.
        - duration: The duration of each event, calculated as the difference between the maximum
            and minimum frame values plus one.
        - first_timepoint, last_timepoint: The first and last frames in which each event occurs.
        - total_size: The total number of unique objects involved in each event
            (calculated if obj_id_column is provided).
        - min_size, max_size: The minimum and maximum size of each event,
            defined as the number of objects in the event's smallest and largest frames, respectively.
        - first_frame_centroid_x, first_frame_centroid_y, last_frame_centroid_x, last_frame_centroid_y:
            The x and y coordinates of the centroid of all objects in the first and last frames of each event
            (calculated if position_columns is provided).
        - centroid_speed: The speed of the centroid, calculated as the distance between
            the first and last frame centroids divided by the duration (calculated if position_columns is provided).
        - direction: The direction of motion of the centroid, calculated as the arctangent of the change in y divided
            by the change in x (calculated if position_columns is provided).
        - first_frame_spatial_extent, last_frame_spatial_extent: The maximum distance between any pair of objects in the
            first and last frames (calculated if position_columns is provided).
        - first_frame_convex_hull_area, last_frame_convex_hull_area: The areas of the convex hulls enclosing all objects
            in the first and last frames (calculated if position_columns is provided).
        - size_variability: The standard deviation of the event size over all frames, providing a measure of the
            variability in the size of the event over time (calculated if obj_id_column is provided).
    """
    map_deprecated_params = {
        "collid_column": "clid_column",
        "pos_columns": "position_columns",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    clid_column = updated_kwargs.get("clid_column", clid_column)
    position_columns = updated_kwargs.get("position_columns", position_columns)

    # Error handling: Check if necessary columns are present in the input data
    if data.empty:
        return pd.DataFrame(
            columns=[clid_column, "duration", "first_timepoint", "last_timepoint"]
            + (["total_size", "min_size", "max_size", "size_variability"] if obj_id_column else [])
            + (
                [f'first_frame_centroid_{col}' for col in position_columns]
                + [f'last_frame_centroid_{col}' for col in position_columns]
                + ["centroid_speed", "direction"]
                + [f'{t}_spatial_extent' for t in ["first_frame", "last_frame"]]
                + [f'{t}_convex_hull_area' for t in ["first_frame", "last_frame"]]
                if position_columns
                else []
            )
        )

    necessary_columns = [frame_column, clid_column]
    if obj_id_column:
        necessary_columns.append(obj_id_column)
    if position_columns:
        necessary_columns.extend(position_columns)

    for col in necessary_columns:
        if col not in data.columns and col is not None:
            raise ValueError(f"The column '{col}' is not present in the input data.")

    collid_groups = data.groupby(clid_column)

    # Initialize an empty list to store the statistics
    stats_list = []

    for collid, group_data in collid_groups:

        collid_stats = {clid_column: collid}

        # Grouping by collid_column to get initial statistics
        duration = group_data[frame_column].max() - group_data[frame_column].min() + 1
        collid_stats['duration'] = duration
        collid_stats['first_timepoint'] = group_data[frame_column].min()
        collid_stats['last_timepoint'] = group_data[frame_column].max()

        # If obj_id_column is provided, calculate size related stats
        if obj_id_column:
            total_size = group_data[obj_id_column].nunique()

            collid_stats['total_size'] = total_size

        # calculate min and max size based on the number of objects in each frame
        frame_size_stats = group_data.groupby(frame_column).size()
        collid_stats['min_size'] = frame_size_stats.min()
        collid_stats['max_size'] = frame_size_stats.max()

        # If posCol is provided, calculate centroid coordinates for the
        if position_columns:
            tp_1 = collid_stats['first_timepoint']
            tp_2 = collid_stats['last_timepoint']

            centroid_data = group_data.groupby(frame_column)[position_columns].mean().reset_index()

            for col in position_columns:
                collid_stats[f'first_frame_centroid_{col}'] = centroid_data.query(f'{frame_column} == {tp_1}')[
                    col
                ].to_numpy()[0]
                collid_stats[f'last_frame_centroid_{col}'] = centroid_data.query(f'{frame_column} == {tp_2}')[
                    col
                ].to_numpy()[0]

            # Calculate speed and direction
            speed = np.linalg.norm(
                np.column_stack([collid_stats[f'first_frame_centroid_{col}'] for col in position_columns])
                - np.column_stack([collid_stats[f'last_frame_centroid_{col}'] for col in position_columns]),
                axis=1,
            ) / (collid_stats['duration'] - 1)

            collid_stats['centroid_speed'] = speed[0]

            # Direction For 2D data
            if len(position_columns) == 2:
                collid_stats['direction'] = np.arctan2(
                    collid_stats[f'last_frame_centroid_{position_columns[1]}']
                    - collid_stats[f'first_frame_centroid_{position_columns[1]}'],
                    collid_stats[f'last_frame_centroid_{position_columns[0]}']
                    - collid_stats[f'first_frame_centroid_{position_columns[0]}'],
                )
            # Direction For 3D data
            elif len(position_columns) == 3:
                dx = (
                    collid_stats[f'last_frame_centroid_{position_columns[0]}']
                    - collid_stats[f'first_frame_centroid_{position_columns[0]}']
                )
                dy = (
                    collid_stats[f'last_frame_centroid_{position_columns[1]}']
                    - collid_stats[f'first_frame_centroid_{position_columns[1]}']
                )
                dz = (
                    collid_stats[f'last_frame_centroid_{position_columns[2]}']
                    - collid_stats[f'first_frame_centroid_{position_columns[2]}']
                )

                # Calculate azimuth and elevation
                collid_stats['azimuth'] = np.arctan2(dy, dx)
                collid_stats['elevation'] = np.arctan2(dz, np.sqrt(dx**2 + dy**2))
            else:
                raise ValueError("Position columns can only be 2 or 3.")

            # Loop over first and last frames separately to calculate the spatial extent and convex hull area
            for frame_name, frame_number in zip(['first_frame', 'last_frame'], [tp_1, tp_2]):
                # Get data for either the first or last frame
                frame_data = group_data.query(f'{frame_column} == {frame_number}')

                # Calculate spatial extent
                spatial_extent = pdist(frame_data[position_columns].values).max() if len(frame_data) > 1 else 0
                collid_stats[f'{frame_name}_spatial_extent'] = spatial_extent

                # Calculate convex hull area
                try:
                    convex_hull_area = (
                        ConvexHull(frame_data[position_columns].values).volume
                        if len(frame_data) > len(position_columns)
                        else 0
                    )
                except QhullError:
                    convex_hull_area = 0
                collid_stats[f'{frame_name}_convex_hull_area'] = convex_hull_area

        stats_list.append(collid_stats)

    # Create a DataFrame from the list of statistics
    stats_df = pd.DataFrame(stats_list)

    # Calculate size variability
    if obj_id_column:
        # Calculating size for each collid and frame
        frame_size_stats = data.groupby([clid_column, frame_column])[obj_id_column].nunique().reset_index(name='size')
        size_variability = frame_size_stats.groupby(clid_column)['size'].std().reset_index(name='size_variability')
        stats_df = stats_df.merge(size_variability, on=clid_column, how='left')

    return stats_df
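
A minimal usage sketch (import path, column names and values are illustrative assumptions):

```python
import pandas as pd

from arcos4py.tools import calculate_statistics  # import path assumed

# One collective event involving two objects over three frames.
df = pd.DataFrame({
    'frame': [0, 0, 1, 1, 2],
    'collid': [1, 1, 1, 1, 1],
    'obj_id': [1, 2, 1, 2, 1],
    'x': [0.0, 1.0, 0.5, 1.5, 1.0],
    'y': [0.0, 0.0, 0.5, 0.5, 1.0],
})

stats = calculate_statistics(df, frame_column='frame', clid_column='collid',
                             obj_id_column='obj_id', position_columns=['x', 'y'])
print(stats[['collid', 'duration', 'total_size', 'centroid_speed']])
```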

calculate_statistics_per_frame(data, frame_column='frame', clid_column='collid', position_columns=None, **kwargs)

Calculate per-frame summary statistics for collective events.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | DataFrame | Input data containing information on the collective events. | required |
| frame_column | str | The column name representing the frame numbers. | 'frame' |
| clid_column | str | The column name representing the collective event IDs. | 'collid' |
| position_columns | List[str] | List of column names representing the position coordinates. Defaults to None. | None |
| **kwargs | Any | Additional keyword arguments. Includes deprecated parameters: collid_column (str), use clid_column instead; pos_columns (List[str], optional), use position_columns instead. | {} |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | A DataFrame containing the per-frame summary statistics of the collective events. |

Statistics Calculated:

- collid: The unique ID representing each collective event.
- frame: The frame number.
- size: The number of objects in the collective event.
- centroid_x, centroid_y: The x and y coordinates of the centroid of all objects in the collective event (calculated if position_columns is provided).
- spatial_extent: The maximum distance between any pair of objects in the collective event (calculated if position_columns is provided).
- convex_hull_area: The area of the convex hull enclosing all objects in the collective event (calculated if position_columns is provided).
- direction: The direction of motion of the centroid, calculated as the arctangent of the change in y divided by the change in x (calculated if position_columns is provided).
- centroid_speed: The speed of the centroid, calculated as the norm of the frame-to-frame change in the centroid position (calculated if position_columns is provided).

Source code in arcos4py/tools/_stats.py
def calculate_statistics_per_frame(
    data: pd.DataFrame,
    frame_column: str = "frame",
    clid_column: str = "collid",
    position_columns: Union[List[str], None] = None,
    **kwargs,
) -> pd.DataFrame:
    """Calculate summary statistics for collective events based on the entire duration of each event.

    Arguments:
        data (pd.DataFrame): Input data containing information on the collective events.
        frame_column (str): The column name representing the frame numbers.
        clid_column (str): The column name representing the collective event IDs.
        position_columns (List[str], optional): List of column names representing the position coordinates. Defaults to None.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - collid_column (str): Deprecated. Use clid_column instead.
            - pos_columns (List[str], optional): Deprecated. Use position_columns instead.


    Returns:
        pd.DataFrame: A DataFrame containing the summary statistics of the collective events.

    Statistics Calculated:
        - collid: The unique ID representing each collective event.
        - frame: The frame number.
        - size: The number of objects in the collective event
        - centroid_x, centroid_y: The x and y coordinates of the centroid of all objects in the collective event
            (calculated if pos_columns is provided).
        - spatial_extent: The maximum distance between any pair of objects in the collective event
            (calculated if pos_columns is provided).
        - convex_hull_area: The area of the convex hull enclosing all objects in the collective event
            (calculated if pos_columns is provided).
        - direction: The direction of motion of the centroid, calculated as the arctangent of the change in y divided
            by the change in x (calculated if position_columns is provided).
        - centroid_speed: The speed of the centroid, calculated as the norm of the frame-to-frame change
            in the centroid position (calculated if position_columns is provided).
    """
    map_deprecated_params = {
        "collid_column": "clid_column",
        "pos_columns": "position_columns",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    clid_column = updated_kwargs.get("clid_column", clid_column)
    position_columns = updated_kwargs.get("position_columns", position_columns)

    necessary_columns = [frame_column, clid_column]
    if position_columns:
        necessary_columns.extend(position_columns)

    for col in necessary_columns:
        if col not in data.columns and col is not None:
            raise ValueError(f"The column '{col}' is not present in the input data.")

    if data.empty:
        # Base columns are always present; spatial columns are added only when position_columns is given.
        return pd.DataFrame(
            columns=(
                [clid_column, frame_column, "size"]
                + (
                    [f"centroid_{col}" for col in position_columns]
                    + ["spatial_extent", "convex_hull_area", "centroid_speed", "direction"]
                    if position_columns
                    else []
                )
            )
        )

    collid_groups = data.groupby([frame_column, clid_column])
    stats_list = []

    for (frame, collid), group_data in collid_groups:

        frame_stats = {clid_column: collid, frame_column: frame}

        frame_stats['size'] = group_data.count()[frame_column]

        # If pos_columns are provided, calculate spatial statistics for this frame
        if position_columns:
            # Calculate centroid
            centroid = group_data[position_columns].mean().to_dict()
            for pos_col, cent_val in centroid.items():
                frame_stats[f'centroid_{pos_col}'] = cent_val

            # Calculate spatial extent
            spatial_extent = pdist(group_data[position_columns].values).max() if len(group_data) > 1 else 0
            frame_stats['spatial_extent'] = spatial_extent

            # Calculate convex hull area
            try:
                convex_hull_area = (
                    ConvexHull(group_data[position_columns].values).volume
                    if len(group_data) > len(position_columns)
                    else 0
                )
            except QhullError:
                convex_hull_area = 0
            frame_stats['convex_hull_area'] = convex_hull_area

        stats_list.append(frame_stats)

    # Create a DataFrame from the list of statistics
    stats_df = pd.DataFrame(stats_list)

    # If pos_columns are provided, we can calculate speed and direction by looking at changes between frames
    if position_columns:
        stats_df.sort_values(by=[clid_column, frame_column], inplace=True)

        for col in position_columns:
            stats_df[f'delta_{col}'] = stats_df.groupby(clid_column)[f'centroid_{col}'].diff()

        # Calculate speed (the norm of the delta vector)
        stats_df['centroid_speed'] = np.linalg.norm(
            stats_df[[f'delta_{col}' for col in position_columns]].values, axis=1
        )

        # Calculate direction (only for 2D)
        if len(position_columns) == 2:
            stats_df['direction'] = np.arctan2(
                stats_df['delta_' + position_columns[1]], stats_df['delta_' + position_columns[0]]
            )

        # Clean up temporary delta columns
        stats_df.drop(columns=[f'delta_{col}' for col in position_columns], inplace=True)

    return stats_df
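
A minimal usage sketch (import path, column names and values are illustrative assumptions):

```python
import pandas as pd

from arcos4py.tools import calculate_statistics_per_frame  # import path assumed

# One collective event with two objects in each of two frames.
df = pd.DataFrame({
    'frame': [0, 0, 1, 1],
    'collid': [1, 1, 1, 1],
    'x': [0.0, 1.0, 0.5, 1.5],
    'y': [0.0, 0.0, 0.5, 0.5],
})

per_frame = calculate_statistics_per_frame(df, frame_column='frame', clid_column='collid',
                                           position_columns=['x', 'y'])
print(per_frame[['collid', 'frame', 'size', 'centroid_speed']])
```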

estimate_eps(data=None, image=None, method='kneepoint', position_columns=None, frame_column='t', n_neighbors=5, plot=True, plt_size=(5, 5), max_samples=50000, binarize_threshold=0, **kwargs)

Estimates eps parameter for DBSCAN using the k-distance graph method.

Works with either point data in a DataFrame or pixel data from an image/image series.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | Optional[DataFrame] | DataFrame containing coordinates and frame info. Required if 'image' is None. | None |
| image | Optional[ndarray] | Image array (2D) or time series (3D). Required if 'data' is None. | None |
| method | str | Method for choosing eps from k-distances: 'kneepoint', 'mean', 'median'. | 'kneepoint' |
| position_columns | Optional[list[str]] | Column names for spatial coordinates in 'data'. Defaults to ['y', 'x'] for 2D images or ['z', 'y', 'x'] for 3D. Required if 'data' is provided. | None |
| frame_column | str | Column name for frame/time in 'data'. | 't' |
| n_neighbors | int | The 'k' for the k-distance calculation (distance to the k-th neighbour). Typically set to MinPts - 1 for DBSCAN. | 5 |
| plot | bool | If True, plots the sorted k-distance graph with the estimated eps. | True |
| plt_size | tuple[int, int] | Figure size for the plot. | (5, 5) |
| max_samples | int | Max number of k-distances to use for estimation (subsampling). | 50000 |
| binarize_threshold | float | Threshold for converting 'image' pixels to points. | 0 |
| **kwargs | Any | Additional keyword arguments passed to the estimation method. For 'kneepoint': S, online, curve, interp_method, direction, polynomial_degree. For 'mean'/'median': mean_multiplier, median_multiplier (defaults to 1.5). | {} |

Returns:

| Type | Description |
| --- | --- |
| float | Estimated eps value. |

Raises:

| Type | Description |
| --- | --- |
| ValueError | If input requirements are not met (e.g., both/neither data/image given, missing columns, no valid distances found). |

Source code in arcos4py/tools/_detect_events.py
def estimate_eps(  # noqa: C901
    data: Optional[pd.DataFrame] = None,
    image: Optional[np.ndarray] = None,
    method: str = "kneepoint",
    position_columns: Optional[list[str]] = None,
    frame_column: str = "t",
    n_neighbors: int = 5,
    plot: bool = True,
    plt_size: tuple[int, int] = (5, 5),
    max_samples: int = 50_000,
    binarize_threshold: float = 0,
    **kwargs: Any,
) -> float:
    """Estimates eps parameter for DBSCAN using the k-distance graph method.

    Works with either point data in a DataFrame or pixel data from an image/image series.

    Args:
        data (Optional[pd.DataFrame]): DataFrame containing coordinates and frame info.
                                       Required if 'image' is None.
        image (Optional[np.ndarray]): Image array (2D) or time series (3D).
                                    Required if 'data' is None.
        method (str): Method for choosing eps from k-distances: 'kneepoint', 'mean', 'median'.
        position_columns (Optional[list[str]]): Column names for spatial coordinates in 'data'.
                                                Defaults to ['y', 'x'] for 2D images or ['z', 'y', 'x'] for 3D.
                                                Required if 'data' is provided.
        frame_column (str): Column name for frame/time in 'data'. Defaults to 't'.
        n_neighbors (int): The 'k' for k-distance calculation (distance to k-th neighbor).
                           Typically set to MinPts-1 for DBSCAN. Defaults to 5.
        plot (bool): If True, plots the sorted k-distance graph with the estimated eps.
        plt_size (tuple[int, int]): Figure size for the plot.
        max_samples (int): Max number of k-distances to use for estimation (subsampling).
        binarize_threshold (float): Threshold for converting 'image' pixels to points.
        **kwargs (Any): Additional keyword arguments passed to the estimation method.
                         For 'kneepoint': S, online, curve, interp_method, direction, polynomial_degree.
                         For 'mean'/'median': mean_multiplier, median_multiplier (defaults to 1.5).

    Returns:
        float: Estimated eps value.

    Raises:
        ValueError: If input requirements are not met (e.g., both/neither data/image given,
                    missing columns, no valid distances found).
    """
    method_options = ["kneepoint", "mean", "median"]
    if method not in method_options:
        raise ValueError(f"Method must be one of {method_options}")

    if (data is None and image is None) or (data is not None and image is not None):
        raise ValueError("Provide either a DataFrame ('data') or an image ('image'), not both.")

    data_processed: Optional[pd.DataFrame] = None

    # --- Process Image Input ---
    if image is not None:
        ndim = image.ndim
        coords_list = []

        if ndim == 3:  # Time series (T, Y, X) or (T, Z, Y, X)? Assuming (T, Y, X) or (T, Z, Y, X)
            n_frames = image.shape[0]
            img_dims = image.shape[1:]  # Spatial dimensions
            if position_columns is None:
                # Default names based on spatial dimensions
                if len(img_dims) == 2:  # (Y, X)
                    position_columns = ['y', 'x']
                elif len(img_dims) == 3:  # (Z, Y, X)
                    position_columns = ['z', 'y', 'x']
                else:
                    raise ValueError(f"Unsupported image spatial dimensions: {len(img_dims)}")

            elif len(position_columns) != len(img_dims):
                raise ValueError(
                    f"Length of position_columns ({len(position_columns)}) must match image spatial dimensions ({len(img_dims)})."
                )

            print(f"Processing {n_frames} image frames with threshold {binarize_threshold}...")
            for t, img_frame in enumerate(image):
                frame_coords = _binarize_image_to_coords(img_frame, threshold=binarize_threshold)
                if frame_coords.size > 0:
                    # Add frame number as the first column
                    coords_with_frame = np.column_stack((np.full(frame_coords.shape[0], t), frame_coords))
                    coords_list.append(coords_with_frame)

        elif ndim == 2:  # Single image (Y, X) or (Z, Y, X)? Assuming (Y, X)
            img_dims = image.shape
            if position_columns is None:
                # Default names based on spatial dimensions
                if len(img_dims) == 2:  # (Y, X)
                    position_columns = ['y', 'x']
                # Add handling for single 3D image if needed
                # elif len(img_dims) == 3: # (Z, Y, X)
                #    position_columns = ['z', 'y', 'x']
                else:
                    raise ValueError(f"Unsupported image spatial dimensions: {len(img_dims)}")
            elif len(position_columns) != len(img_dims):
                raise ValueError(
                    f"Length of position_columns ({len(position_columns)}) must match image spatial dimensions ({len(img_dims)})."
                )

            print(f"Processing single image frame with threshold {binarize_threshold}...")
            frame_coords = _binarize_image_to_coords(image, threshold=binarize_threshold)
            if frame_coords.size > 0:
                # Add a dummy frame number (0) as the first column
                coords_with_frame = np.column_stack((np.zeros(frame_coords.shape[0]), frame_coords))
                coords_list.append(coords_with_frame)
        else:
            raise ValueError(f"Unsupported image dimension: {ndim}. Expecting 2 or 3.")

        if not coords_list:
            raise ValueError(f"No coordinates found in image data after applying threshold {binarize_threshold}.")

        # Combine coordinates from all frames
        all_coords_np = np.vstack(coords_list)
        data_processed = pd.DataFrame(all_coords_np, columns=[frame_column] + position_columns)

    # --- Process DataFrame Input ---
    elif data is not None:
        if position_columns is None:
            raise ValueError("`position_columns` must be provided when input is a DataFrame.")
        # Validate DataFrame structure
        required_cols = [frame_column] + position_columns
        missing_cols = [col for col in required_cols if col not in data.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns in DataFrame: {missing_cols}")
        data_processed = data[required_cols].copy()  # Work on a copy

    # Should have data_processed DataFrame by now
    if data_processed is None or data_processed.empty:
        raise ValueError("Could not process input data into a valid format.")

    # --- Calculate k-Distances ---
    # Convert relevant columns to numpy for efficiency
    if position_columns is None:
        raise ValueError("position_columns must be provided.")
    data_np = data_processed[[frame_column] + position_columns].to_numpy(dtype=np.float64)

    # Sort by frame first, essential for correct splitting
    data_np = data_np[data_np[:, 0].argsort()]

    # Get unique frame numbers and the indices where each new frame starts
    unique_frames, frame_start_indices = np.unique(data_np[:, 0], return_index=True)

    grouped_coords = np.split(data_np[:, 1:], frame_start_indices[1:])

    print(f"Calculating {n_neighbors}-th neighbor distances for {len(grouped_coords)} frames...")
    all_k_distances = []
    for frame_coords in grouped_coords:
        if frame_coords.shape[0] > n_neighbors:
            k_distances = _get_kth_neighbor_distance(frame_coords, k=n_neighbors)
            if k_distances.size > 0:
                all_k_distances.append(k_distances)

    if not all_k_distances:
        raise ValueError(f"No frames found with enough points (> {n_neighbors}) to calculate k-th neighbor distances.")

    # Combine distances from all valid frames
    distances_array = np.concatenate(all_k_distances)

    # Remove any non-finite values (though less likely with KDTree distances)
    distances_finite = distances_array[np.isfinite(distances_array)]

    if distances_finite.shape[0] == 0:
        raise ValueError("No valid finite k-th neighbor distances found.")

    # Subsample if necessary
    n_total_distances = distances_finite.shape[0]
    if n_total_distances > max_samples:
        print(f"Subsampling {max_samples} distances from {n_total_distances} for estimation.")
        distances_sampled = np.random.choice(distances_finite, max_samples, replace=False)
    else:
        distances_sampled = distances_finite

    # Sort the distances for analysis and plotting
    distances_sorted = np.sort(distances_sampled)

    # --- Estimate eps ---
    eps: float = 0.0  # Initialize eps

    print(f"Estimating eps using '{method}' method...")
    if method == "kneepoint":
        kneedle_kwargs = {
            'S': kwargs.get("S", 1.0),
            'online': kwargs.get("online", False),
            'curve': kwargs.get("curve", "convex"),
            'direction': kwargs.get("direction", "increasing"),
            'interp_method': kwargs.get("interp_method", "interp1d"),
            'polynomial_degree': kwargs.get("polynomial_degree", 7),
        }
        # Filter out None values potentially returned by kwargs.get if default was None
        kneedle_kwargs = {k: v for k, v in kneedle_kwargs.items() if v is not None}

        kneedle = KneeLocator(x=np.arange(distances_sorted.shape[0]), y=distances_sorted, **kneedle_kwargs)
        if kneedle.knee is None:
            warnings.warn("Kneepoint detection failed. Falling back to median distance as eps.")
            # Fallback strategy: use median * 1.0 (no multiplier)
            eps = float(np.median(distances_sorted))
        else:
            eps = float(distances_sorted[kneedle.knee])  # Ensure float type

    elif method == "mean":
        multiplier = kwargs.get("mean_multiplier", 1.5)
        eps = np.mean(distances_sorted) * multiplier

    elif method == "median":
        multiplier = kwargs.get("median_multiplier", 1.5)
        eps = np.median(distances_sorted) * multiplier

    print(f"Estimated eps: {eps:.4f}")

    # --- Plotting ---
    if plot:
        fig, ax = plt.subplots(figsize=plt_size)
        ax.plot(distances_sorted, marker='.', linestyle='-', markersize=2, label=f'{n_neighbors}-th Neighbor Distance')
        ax.axhline(eps, color="r", linestyle="--", label=f'Estimated eps = {eps:.4f}')

        # Annotate kneepoint if found
        if method == "kneepoint" and 'kneedle' in locals() and kneedle.knee is not None:
            ax.plot(kneedle.knee, distances_sorted[kneedle.knee], 'ro', markersize=6, label='Detected Knee')

        ax.set_xlabel("Points Sorted by Distance")
        ax.set_ylabel(f"Distance to {n_neighbors}-th Nearest Neighbor")
        ax.set_title("k-Distance Graph for eps Estimation")
        ax.legend()
        ax.grid(True, linestyle=':', alpha=0.6)
        plt.tight_layout()
        plt.show()

    return eps
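
A minimal usage sketch (the synthetic point cloud and the import path arcos4py.tools are illustrative assumptions, not part of the reference above):

import numpy as np
import pandas as pd

from arcos4py.tools import estimate_eps  # assumed import path

# synthetic 2D point data: 10 frames with 200 points each
rng = np.random.default_rng(42)
n_frames, n_points = 10, 200
df = pd.DataFrame(
    {
        't': np.repeat(np.arange(n_frames), n_points),
        'x': rng.uniform(0, 100, n_frames * n_points),
        'y': rng.uniform(0, 100, n_frames * n_points),
    }
)

# k-distance based estimate; plot=False suppresses the diagnostic graph
eps = estimate_eps(
    data=df,
    method='kneepoint',
    position_columns=['x', 'y'],
    frame_column='t',
    n_neighbors=5,
    plot=False,
)
# eps can now be passed on, e.g. to track_events_dataframe below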

remove_image_background(image, filter_type='gaussian', size=(10, 1, 1), dims='TXY', crop_time_axis=False)

Removes background from images. Assumes axis order (t, y, x) for 2d images and (t, z, y, x) for 3d images.

Parameters:

Name Type Description Default
image ndarray

Image to remove background from.

required
filter_type Union[str, function]

Filter to use to remove background. Can be one of ['median', 'gaussian'].

'gaussian'
size (int, Tuple)

Size of filter to use. For median filter, this is the size of the window. For gaussian filter, this is the standard deviation. If a single int is passed in, it is assumed to be the same for all dimensions. If a tuple is passed in, it is assumed to correspond to the size of the filter in each dimension. Default is (10, 1, 1).

(10, 1, 1)
dims str

Dimensions to apply filter over. Can be one of ['TXY', 'TZXY']. Default is 'TXY'.

'TXY'
crop_time_axis bool

Whether to crop the time axis. Default is False.

False

Returns (np.ndarray): Image with background removed. If crop_time_axis is True, half of the filter size along the first (t) axis is cropped from the beginning and end respectively.

Source code in arcos4py/tools/_cleandata.py
def remove_image_background(
    image: np.ndarray, filter_type: str = 'gaussian', size=(10, 1, 1), dims="TXY", crop_time_axis: bool = False
) -> np.ndarray:
    """Removes background from images. Assumes axis order (t, y, x) for 2d images and (t, z, y, x) for 3d images.

    Arguments:
        image (np.ndarray): Image to remove background from.
        filter_type (Union[str, function]): Filter to use to remove background. Can be one of ['median', 'gaussian'].
        size (int, Tuple): Size of filter to use. For median filter, this is the size of the window.
            For gaussian filter, this is the standard deviation.
            If a single int is passed in, it is assumed to be the same for all dimensions.
            If a tuple is passed in, it is assumed to correspond to the size of the filter in each dimension.
            Default is (10, 1, 1).
        dims (str): Dimensions to apply filter over. Can be one of ['TXY', 'TZXY']. Default is 'TXY'.
        crop_time_axis (bool): Whether to crop the time axis. Default is False.

    Returns (np.ndarray): Image with background removed.
        If crop_time_axis is True, half of the filter size along the first (t) axis is cropped
        from the beginning and end respectively.
    """
    # correct images with a filter applied over time
    allowed_filters = ["median", "gaussian"]
    dims_list = list(dims.upper())

    # check that every dimension label is valid
    for i in dims_list:
        if i not in ["T", "X", "Y", "Z"]:
            raise ValueError(f"Invalid dimension {i}. Must be 'T', 'X', 'Y', or 'Z'.")

    if len(dims_list) > len(set(dims_list)):
        raise ValueError("Duplicate dimensions in dims.")

    if len(dims_list) != image.ndim:
        raise ValueError(
            f"Length of dims must be equal to number of dimensions in image. Image has {image.ndim} dimensions."
        )

    if filter_type not in allowed_filters:
        raise ValueError(f'Filter type must be one of {allowed_filters}.')

    # get index of time axis
    t_idx = dims_list.index("T")

    orig_image = image.copy()

    if isinstance(size, int):
        size = (size,) * image.ndim
    elif isinstance(size, tuple):
        if len(size) != image.ndim:
            raise ValueError(f'Filter size must have {image.ndim} dimensions.')
        # check size of dimensions are compatible with image
        for idx, s in enumerate(size):
            if s > image.shape[idx]:
                raise ValueError(f'Filter size in dimension {idx} is larger than image size in that dimension.')
    else:
        raise ValueError('Filter size must be an int or tuple.')

    if filter_type == 'median':
        filtered = median_filter(orig_image, size=size)
    elif filter_type == 'gaussian':
        filtered = gaussian_filter(orig_image, sigma=size)

    # crop time axis if necessary
    shift = size[t_idx] // 2
    corr = np.subtract(orig_image, filtered, dtype=np.float32)
    if crop_time_axis:
        corr = corr[shift:-shift]

    return corr
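
A short usage sketch (synthetic input; the import path arcos4py.tools is an assumption based on the source location above):

import numpy as np

from arcos4py.tools import remove_image_background  # assumed import path

# 50-frame stack: noise on top of a constant offset acting as "background"
rng = np.random.default_rng(0)
stack = rng.random((50, 64, 64)).astype(np.float32) + 10.0

corrected = remove_image_background(
    stack, filter_type='gaussian', size=(10, 1, 1), dims='TXY', crop_time_axis=True
)
print(corrected.shape)  # size[0] // 2 frames cropped at each end -> (40, 64, 64)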

track_events_dataframe(X, position_columns, frame_column, id_column=None, binarized_measurement_column=None, clid_column='collid', eps=1.0, eps_prev=None, min_clustersize=3, min_samples=None, clustering_method='dbscan', linking_method='nearest', allow_merges=False, allow_splits=False, stability_threshold=10, remove_small_clusters=False, min_size_for_split=1, reg=1, reg_m=10, cost_threshold=0, n_prev=1, predictor=False, n_jobs=1, show_progress=True, **kwargs)

Function to track collective events in a dataframe.

Parameters:

Name Type Description Default
X DataFrame

The input dataframe containing the data to track.

required
position_columns List[str]

The names of the columns representing coordinates.

required
frame_column str

The name of the column containing frame ids.

required
id_column str | None

The name of the column representing IDs. None if no such column.

None
binarized_measurement_column str | None

The name of the column representing binarized measurements, if None all measurements are used.

None
clid_column str

The name of the output column representing collective events, will be generated.

'collid'
eps float

Maximum distance for clustering, default is 1.

1.0
eps_prev float | None

Maximum distance for linking previous clusters, if None, eps is used. Default is None.

None
min_clustersize int

Minimum cluster size. Default is 3.

3
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples defaults to min_clustersize.

None
clustering_method str

The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".

'dbscan'
linking_method str

The method used for linking, one of ['nearest', 'transportsolver']. Default is 'nearest'.

'nearest'
allow_merges bool

Whether or not to allow merges. Default is False.

False
allow_splits bool

Whether or not to allow splits. Default is False.

False
stability_threshold int

Number of frames to consider for stability. Default is 10.

10
remove_small_clusters bool

Whether or not to remove small clusters. Default is False.

False
min_size_for_split int

Minimum size for a split. Default is 1.

1
reg float

Regularization parameter for transportation solver. Default is 1.

1
reg_m float

Regularization parameter for transportation solver. Default is 10.

10
cost_threshold float

Cost threshold for transportation solver. Default is 0.

0
n_prev int

Number of previous frames to consider. Default is 1.

1
predictor bool | Callable

Whether or not to use a predictor. Default is False. True uses the default predictor. A callable can be passed to use a custom predictor. See default predictor method for details.

False
n_jobs int

Number of jobs to run in parallel. Default is 1.

1
show_progress bool

Whether or not to show progress bar. Default is True.

True
**kwargs Any

Additional keyword arguments. Includes deprecated parameters for backwards compatibility. - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead. - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead. - minSamples: Deprecated parameter for min_samples. Use min_samples instead. - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead. - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead. - nPrev: Deprecated parameter for n_prev. Use n_prev instead. - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead. - showProgress: Deprecated parameter for show_progress. Use show_progress instead.

{}

Returns:

Type Description
DataFrame | tuple[DataFrame, LineageTracker]

pd.DataFrame: Dataframe with tracked events. If allow_merges or allow_splits is True, a tuple of (DataFrame, LineageTracker) is returned.

Source code in arcos4py/tools/_detect_events.py
def track_events_dataframe(
    X: pd.DataFrame,
    position_columns: List[str],
    frame_column: str,
    id_column: str | None = None,
    binarized_measurement_column: str | None = None,
    clid_column: str = "collid",
    eps: float = 1.0,
    eps_prev: float | None = None,
    min_clustersize: int = 3,
    min_samples: int | None = None,
    clustering_method: str = "dbscan",
    linking_method: str = 'nearest',
    allow_merges: bool = False,
    allow_splits: bool = False,
    stability_threshold: int = 10,
    remove_small_clusters: bool = False,
    min_size_for_split: int = 1,
    reg: float = 1,
    reg_m: float = 10,
    cost_threshold: float = 0,
    n_prev: int = 1,
    predictor: bool | Callable = False,
    n_jobs: int = 1,
    show_progress: bool = True,
    **kwargs,
) -> pd.DataFrame | tuple[pd.DataFrame, LineageTracker]:
    """Function to track collective events in a dataframe.

    Arguments:
        X (pd.DataFrame): The input dataframe containing the data to track.
        position_columns (List[str]): The names of the columns representing coordinates.
        frame_column (str): The name of the column containing frame ids.
        id_column (str | None): The name of the column representing IDs. None if no such column.
        binarized_measurement_column (str | None): The name of the column representing binarized measurements,
            if None all measurements are used.
        clid_column (str): The name of the output column representing collective events, will be generated.
        eps (float): Maximum distance for clustering, default is 1.
        eps_prev (float | None): Maximum distance for linking previous clusters, if None, eps is used. Default is None.
        min_clustersize (int): Minimum cluster size. Default is 3.
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clustering_method is 'hdbscan'. If None, min_samples defaults to min_clustersize.
        clustering_method (str): The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".
        linking_method (str): The method used for linking, one of ['nearest', 'transportsolver']. Default is 'nearest'.
        allow_merges (bool): Whether or not to allow merges. Default is False.
        allow_splits (bool): Whether or not to allow splits. Default is False.
        stability_threshold (int): Number of frames to consider for stability. Default is 10.
        remove_small_clusters (bool): Whether or not to remove small clusters. Default is False.
        min_size_for_split (int): Minimum size for a split. Default is 1.
        reg (float): Regularization parameter for transportation solver. Default is 1.
        reg_m (float): Regularization parameter for transportation solver. Default is 10.
        cost_threshold (float): Cost threshold for transportation solver. Default is 0.
        n_prev (int): Number of previous frames to consider. Default is 1.
        predictor (bool | Callable): Whether or not to use a predictor. Default is False.
            True uses the default predictor. A callable can be passed to use a custom predictor.
            See default predictor method for details.
        n_jobs (int): Number of jobs to run in parallel. Default is 1.
        show_progress (bool): Whether or not to show progress bar. Default is True.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
            - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead.
            - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead.
            - minSamples: Deprecated parameter for min_samples. Use min_samples instead.
            - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead.
            - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead.
            - nPrev: Deprecated parameter for n_prev. Use n_prev instead.
            - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.
            - showProgress: Deprecated parameter for show_progress. Use show_progress instead.

    Returns:
        pd.DataFrame: Dataframe with tracked events. If allow_merges or allow_splits is True,
            a tuple of (DataFrame, LineageTracker) is returned.
    """
    map_params = {
        "coordinates_column": "position_columns",
        "bin_meas_column": "binarized_measurement_column",
        "collid_column": "clid_column",
        'epsPrev': 'eps_prev',
        'minClSz': 'min_clustersize',
        'minSamples': 'min_samples',
        'clusteringMethod': 'clustering_method',
        'linkingMethod': 'linking_method',
        'nPrev': 'n_prev',
        'nJobs': 'n_jobs',
        'showProgress': 'show_progress',
    }

    # check for allowed kwargs
    for key in kwargs:
        if key not in map_params.keys():
            raise ValueError(f'Invalid keyword argument {key}')

    # Handle deprecated parameters
    kwargs = handle_deprecated_params(map_params, **kwargs)

    # Assign parameters
    eps_prev = kwargs.get('eps_prev', eps_prev)
    min_clustersize = kwargs.get('min_clustersize', min_clustersize)
    min_samples = kwargs.get('min_samples', min_samples)
    clustering_method = kwargs.get('clustering_method', clustering_method)
    linking_method = kwargs.get('linking_method', linking_method)
    n_prev = kwargs.get('n_prev', n_prev)
    n_jobs = kwargs.get('n_jobs', n_jobs)

    linker = Linker(
        eps=eps,
        eps_prev=eps_prev,
        min_clustersize=min_clustersize,
        min_samples=min_samples,
        clustering_method=clustering_method,
        linking_method=linking_method,
        n_prev=n_prev,
        predictor=predictor,
        n_jobs=n_jobs,
        allow_merges=allow_merges,
        allow_splits=allow_splits,
        stability_threshold=stability_threshold,
        remove_small_clusters=remove_small_clusters,
        min_size_for_split=min_size_for_split,
        reg=reg,
        reg_m=reg_m,
        cost_threshold=cost_threshold,
    )

    tracker = DataFrameTracker(
        linker=linker,
        position_columns=position_columns,
        frame_column=frame_column,
        obj_id_column=id_column,
        binarized_measurement_column=binarized_measurement_column,
        clid_column=clid_column,
    )
    df_out = pd.concat(
        [timepoint for timepoint in tqdm(tracker.track(X), total=X[frame_column].nunique(), disable=not show_progress)]
    ).reset_index(drop=True)

    if any([allow_merges, allow_splits]):
        return df_out.query(f"{clid_column} != -1").reset_index(drop=True), linker.lineage_tracker
    return df_out.query(f"{clid_column} != -1").reset_index(drop=True)
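
A minimal tracking sketch on a toy "long format" dataframe (the data values and the import path arcos4py.tools are illustrative assumptions):

import pandas as pd

from arcos4py.tools import track_events_dataframe  # assumed import path

# three neighbouring active objects drifting together across three frames
df = pd.DataFrame(
    {
        'frame': [0, 0, 0, 1, 1, 1, 2, 2, 2],
        'x': [0.0, 1.0, 2.0, 0.5, 1.5, 2.5, 1.0, 2.0, 3.0],
        'y': [0.0, 0.0, 0.0, 0.2, 0.2, 0.2, 0.4, 0.4, 0.4],
        'active': [1] * 9,
    }
)

tracked = track_events_dataframe(
    df,
    position_columns=['x', 'y'],
    frame_column='frame',
    binarized_measurement_column='active',
    eps=2.0,
    min_clustersize=3,
    show_progress=False,
)
print(tracked['collid'].unique())  # expect a single collective event id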

track_events_image(X, eps=1, eps_prev=None, min_clustersize=1, min_samples=None, clustering_method='dbscan', n_prev=1, predictor=False, linking_method='nearest', allow_merges=False, allow_splits=False, stability_threshold=10, remove_small_clusters=False, min_size_for_split=1, reg=1, reg_m=10, cost_threshold=0, dims='TXY', downsample=1, n_jobs=1, show_progress=True, **kwargs)

Function to track events in an image using specified linking and clustering methods.

Parameters:

Name Type Description Default
X ndarray

The input array containing the images to track.

required
eps float

Distance for clustering. Default is 1.

1
eps_prev float | None

Maximum distance for linking previous clusters, if None, eps is used. Default is None.

None
min_clustersize int

Minimum cluster size. Default is 1.

1
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples defaults to min_clustersize.

None
clustering_method str

The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".

'dbscan'
n_prev int

Number of previous frames to consider. Default is 1.

1
predictor bool | Callable

Whether or not to use a predictor. Default is False. True uses the default predictor. A callable can be passed to use a custom predictor. See default predictor method for details.

False
linking_method str

The method used for linking. Default is 'nearest'.

'nearest'
allow_merges bool

Whether or not to allow merges. Default is False.

False
allow_splits bool

Whether or not to allow splits. Default is False.

False
stability_threshold int

The number of frames required for a stable merge or split. Default is 10.

10
remove_small_clusters bool

Whether or not to remove small clusters. Default is False.

False
min_size_for_split int

Minimum size for a split. Default is 1.

1
reg float

Entropy regularization parameter for unbalanced OT algorithm (only for transportation linking).

1
reg_m float

Marginal relaxation parameter for unbalanced OT (only for transportation linking).

10
cost_threshold float

Threshold for filtering low-probability matches (only for transportation linking).

0
dims str

String of dimensions in order, such as 'TXY' or 'TZXY'. Default is "TXY". Possible values are "T", "X", "Y", "Z".

'TXY'
downsample int

Factor by which to downsample the image. Default is 1.

1
n_jobs int

Number of jobs to run in parallel. Default is 1.

1
show_progress bool

Whether or not to show progress bar. Default is True.

True
**kwargs Any

Additional keyword arguments. Includes deprecated parameters for backwards compatibility. - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead. - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead. - minSamples: Deprecated parameter for min_samples. Use min_samples instead. - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead. - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead. - nPrev: Deprecated parameter for n_prev. Use n_prev instead. - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead. - showProgress: Deprecated parameter for show_progress. Use show_progress instead.

{}

Returns:

Type Description
ndarray | tuple[ndarray, LineageTracker]

np.ndarray: Array of images with tracked events. If allow_merges or allow_splits is True, a tuple of (array, LineageTracker) is returned.

Source code in arcos4py/tools/_detect_events.py
def track_events_image(
    X: np.ndarray,
    eps: float = 1,
    eps_prev: float | None = None,
    min_clustersize: int = 1,
    min_samples: int | None = None,
    clustering_method: str = "dbscan",
    n_prev: int = 1,
    predictor: bool | Callable = False,
    linking_method: str = 'nearest',
    allow_merges: bool = False,
    allow_splits: bool = False,
    stability_threshold: int = 10,
    remove_small_clusters: bool = False,
    min_size_for_split: int = 1,
    reg: float = 1,
    reg_m: float = 10,
    cost_threshold: float = 0,
    dims: str = "TXY",
    downsample: int = 1,
    n_jobs: int = 1,
    show_progress: bool = True,
    **kwargs,
) -> np.ndarray | tuple[np.ndarray, LineageTracker]:
    """Function to track events in an image using specified linking and clustering methods.

    Arguments:
        X (np.ndarray): The input array containing the images to track.
        eps (float): Distance for clustering. Default is 1.
        eps_prev (float | None): Maximum distance for linking previous clusters, if None, eps is used. Default is None.
        min_clustersize (int): Minimum cluster size. Default is 1.
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clustering_method is 'hdbscan'. If None, min_samples defaults to min_clustersize.
        clustering_method (str): The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".
        n_prev (int): Number of previous frames to consider. Default is 1.
        predictor (bool | Callable): Whether or not to use a predictor. Default is False.
            True uses the default predictor. A callable can be passed to use a custom predictor.
            See default predictor method for details.
        linking_method (str): The method used for linking. Default is 'nearest'.
        allow_merges (bool): Whether or not to allow merges. Default is False.
        allow_splits (bool): Whether or not to allow splits. Default is False.
        stability_threshold (int): The number of frames required for a stable merge or split. Default is 10.
        remove_small_clusters (bool): Whether or not to remove small clusters. Default is False.
        min_size_for_split (int): Minimum size for a split. Default is 1.
        reg (float): Entropy regularization parameter for unbalanced OT algorithm (only for transportation linking).
        reg_m (float): Marginal relaxation parameter for unbalanced OT (only for transportation linking).
        cost_threshold (float): Threshold for filtering low-probability matches (only for transportation linking).
        dims (str): String of dimensions in order, such as 'TXY' or 'TZXY'. Default is "TXY". Possible values are "T", "X", "Y", "Z".
        downsample (int): Factor by which to downsample the image. Default is 1.
        n_jobs (int): Number of jobs to run in parallel. Default is 1.
        show_progress (bool): Whether or not to show progress bar. Default is True.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
            - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead.
            - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead.
            - minSamples: Deprecated parameter for min_samples. Use min_samples instead.
            - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead.
            - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead.
            - nPrev: Deprecated parameter for n_prev. Use n_prev instead.
            - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.
            - showProgress: Deprecated parameter for show_progress. Use show_progress instead.

    Returns:
        np.ndarray: Array of images with tracked events. If allow_merges or allow_splits is True,
            a tuple of (array, LineageTracker) is returned.
    """
    map_params = {
        'epsPrev': 'eps_prev',
        'minClSz': 'min_clustersize',
        'minSamples': 'min_samples',
        'clusteringMethod': 'clustering_method',
        'linkingMethod': 'linking_method',
        'nPrev': 'n_prev',
        'nJobs': 'n_jobs',
        'showProgress': 'show_progress',
    }

    # check for allowed kwargs
    for key in kwargs:
        if key not in map_params.keys():
            raise ValueError(f'Invalid keyword argument {key}')

    # Handle deprecated parameters
    kwargs = handle_deprecated_params(map_params, **kwargs)

    # Assign parameters
    eps_prev = kwargs.get('eps_prev', eps_prev)
    min_clustersize = kwargs.get('min_clustersize', min_clustersize)
    min_samples = kwargs.get('min_samples', min_samples)
    clustering_method = kwargs.get('clustering_method', clustering_method)
    linking_method = kwargs.get('linking_method', linking_method)
    n_prev = kwargs.get('n_prev', n_prev)
    n_jobs = kwargs.get('n_jobs', n_jobs)

    # Determine the dimensionality
    spatial_dims = set("XYZ")
    D = len([d for d in dims if d in spatial_dims])

    # Adjust parameters based on dimensionality
    adjusted_epsPrev = eps_prev / downsample if eps_prev is not None else None
    adjusted_minClSz = int(min_clustersize / (downsample**D))
    adjusted_minSamples = int(min_samples / (downsample**D)) if min_samples is not None else None

    linker = Linker(
        eps=eps / downsample,
        eps_prev=adjusted_epsPrev,
        min_clustersize=adjusted_minClSz,
        min_samples=adjusted_minSamples,
        clustering_method=clustering_method,
        linking_method=linking_method,
        n_prev=n_prev,
        predictor=predictor,
        reg=reg,
        reg_m=reg_m,
        cost_threshold=cost_threshold,
        n_jobs=n_jobs,
        allow_merges=allow_merges,
        allow_splits=allow_splits,
        stability_threshold=stability_threshold,
        remove_small_clusters=remove_small_clusters,
        min_size_for_split=min_size_for_split,
    )
    tracker = ImageTracker(linker, downsample=downsample)
    # find indices of T in dims
    T_index = dims.upper().index("T")
    out = np.zeros_like(X, dtype=np.uint16)

    for i in tqdm(range(X.shape[T_index]), disable=not show_progress):
        out[i] = tracker.track_iteration(X[i])

    if any([allow_merges, allow_splits]):
        return out, linker.lineage_tracker

    return out
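
A minimal sketch on a synthetic binary image stack (the data and the import path arcos4py.tools are illustrative assumptions):

import numpy as np

from arcos4py.tools import track_events_image  # assumed import path

# (t, x, y) binary stack containing one growing square of active pixels
stack = np.zeros((10, 64, 64), dtype=np.uint8)
for t in range(10):
    stack[t, 20:26 + t, 20:26 + t] = 1

events = track_events_image(stack, eps=1.5, min_clustersize=4, dims='TXY', show_progress=False)
print(events.shape, events.max())  # label image of the same shape; max is the highest event id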

validation

Tools for validating detected collective events.

bootstrap_arcos(df, position_columns=['x'], frame_column='frame', obj_id_column='obj_id', measurement_column='m', method='shuffle_tracks', smooth_k=3, bias_k=51, peak_threshold=0.2, binarization_threshold=0.1, polynomial_degree=1, bias_method='runmed', eps=2, eps_prev=None, min_clustersize=1, n_prev=1, min_duration=1, min_total_size=1, stats_metric=['total_size', 'duration'], pval_alternative='greater', finite_correction=True, n=100, seed=42, allow_duplicates=False, max_tries=100, show_progress=True, verbose=False, parallel_processing=True, plot=True, **kwargs)

Bootstrap data using the ARCOS algorithm.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing the data to be bootstrapped.

required
position_columns list

List of column names containing the x and y coordinates.

['x']
frame_column str

Name of the column containing the frame number.

'frame'
obj_id_column str

Name of the column containing the track id.

'obj_id'
measurement_column str

Name of the column containing the measurement.

'm'
method str | list[str]

Method used for bootstrapping. Can be 'shuffle_tracks', 'shuffle_timepoints', 'shift_timepoints', 'shuffle_binary_blocks', 'shuffle_coordinates_timepoint' or a list of methods, which will be applied in the order given.

'shuffle_tracks'
smooth_k int

Smoothing kernel size.

3
bias_k int

Bias kernel size.

51
peak_threshold float

Threshold for peak detection.

0.2
binarization_threshold float

Threshold for binarization.

0.1
polynomial_degree int

Degree of the polynomial used for bias correction.

1
bias_method str

Bias correction method. Can be 'none', 'runmed', 'lm'

'runmed'
eps float

Epsilon parameter for DBSCAN.

2
eps_prev int | None

Parameter for linking tracks. If None, eps is used.

None
min_clustersize int

Minimum cluster size.

1
n_prev int

Number of previous frames to consider for linking.

1
min_duration int

Minimum duration of a detected event.

1
min_total_size int

Minimum total size of a detected event.

1
stats_metric str | list[str]

Metric to calculate. Can be "duration", "total_size", "min_size", "max_size" or a list of metrics. Default is ["duration", "total_size"].

['total_size', 'duration']
pval_alternative str

Alternative hypothesis for the p-value calculation. Can be "less" or "greater".

'greater'
finite_correction bool

Correct p-values for finite sampling. Default is True.

True
n int

Number of bootstraps.

100
seed int

Seed for the random number generator.

42
allow_duplicates bool

If False, resampling will check if the resampled data contains duplicates. If True, duplicates will be allowed.

False
max_tries int

Maximum number of tries to resample data without duplicates.

100
show_progress bool

Show a progress bar.

True
verbose bool

Print additional information.

False
parallel_processing bool

Use parallel processing.

True
plot bool

Plot the distribution of the bootstrapped data.

True
**kwargs Any

Additional keyword arguments. Includes deprecated parameters. - id_column: Deprecated. Use obj_id_column instead. - meas_column: Deprecated. Use measurement_column instead. - smoothK: Deprecated. Use smooth_k instead. - biasK: Deprecated. Use bias_k instead. - peakThr: Deprecated. Use peak_threshold instead. - binThr: Deprecated. Use binarization_threshold instead. - polyDeg: Deprecated. Use polynomial_degree instead. - biasMet: Deprecated. Use bias_method instead. - epsPrev: Deprecated. Use eps_prev instead. - minClsz: Deprecated. Use min_clustersize instead. - min_size: Deprecated. Use min_total_size instead. - paralell_processing: Deprecated. Use parallel_processing instead.

{}

Returns:

Type Description
tuple[DataFrame, DataFrame]

DataFrame with the bootstrapped statistics and DataFrame with the corresponding p-values.

Source code in arcos4py/validation/_bootstrapping.py
def bootstrap_arcos(
    df: pd.DataFrame,
    position_columns: list = ['x'],
    frame_column: str = 'frame',
    obj_id_column: str = 'obj_id',
    measurement_column: str = 'm',
    method: str | list[str] = 'shuffle_tracks',
    smooth_k: int = 3,
    bias_k: int = 51,
    peak_threshold: float = 0.2,
    binarization_threshold: float = 0.1,
    polynomial_degree: int = 1,
    bias_method: str = "runmed",
    eps: float = 2,
    eps_prev: int | None = None,
    min_clustersize: int = 1,
    n_prev: int = 1,
    min_duration: int = 1,
    min_total_size: int = 1,
    stats_metric: str | list[str] = ["total_size", "duration"],
    pval_alternative: str = "greater",
    finite_correction: bool = True,
    n: int = 100,
    seed: int = 42,
    allow_duplicates: bool = False,
    max_tries: int = 100,
    show_progress: bool = True,
    verbose: bool = False,
    parallel_processing: bool = True,
    plot: bool = True,
    **kwargs,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Bootstrap data using the ARCOS algorithm.

    Arguments:
        df: DataFrame containing the data to be bootstrapped.
        position_columns: List of column names containing the x and y coordinates.
        frame_column: Name of the column containing the frame number.
        obj_id_column: Name of the column containing the track id.
        measurement_column: Name of the column containing the measurement.
        method: Method used for bootstrapping. Can be 'shuffle_tracks', 'shuffle_timepoints', 'shift_timepoints',
            'shuffle_binary_blocks', 'shuffle_coordinates_timepoint' or a list of methods,
            which will be applied in the order given.
        smooth_k: Smoothing kernel size.
        bias_k: Bias kernel size.
        peak_threshold: Threshold for peak detection.
        binarization_threshold: Threshold for binarization.
        polynomial_degree: Degree of the polynomial used for bias correction.
        bias_method: Bias correction method. Can be 'none', 'runmed', 'lm'
        eps: Epsilon parameter for DBSCAN.
        eps_prev: Parameter for linking tracks. If None, eps is used.
        min_clustersize: Minimum cluster size.
        n_prev: Number of previous frames to consider for linking.
        min_duration: Minimum duration of a detected event.
        min_total_size: Minimum total size of a detected event.
        stats_metric: Metric to calculate. Can be "duration", "total_size", "min_size", "max_size" or a list of metrics.
            Default is ["duration", "total_size"].
        pval_alternative: Alternative hypothesis for the p-value calculation. Can be "less" or "greater".
        finite_correction: Correct p-values for finite sampling. Default is True.
        n: Number of bootstraps.
        seed: Seed for the random number generator.
        allow_duplicates: If False, resampling will check if the resampled data contains duplicates.
            If True, duplicates will be allowed.
        max_tries: Maximum number of tries to resample data without duplicates.
        show_progress: Show a progress bar.
        verbose: Print additional information.
        parallel_processing: Use parallel processing.
        plot: Plot the distribution of the bootstrapped data.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - id_column: Deprecated. Use obj_id_column instead.
            - meas_column: Deprecated. Use measurement_column instead.
            - smoothK: Deprecated. Use smooth_k instead.
            - biasK: Deprecated. Use bias_k instead.
            - peakThr: Deprecated. Use peak_threshold instead.
            - binThr: Deprecated. Use binarization_threshold instead.
            - polyDeg: Deprecated. Use polynomial_degree instead.
            - biasMet: Deprecated. Use bias_method instead.
            - epsPrev: Deprecated. Use eps_prev instead.
            - minClsz: Deprecated. Use min_clustersize instead.
            - min_size: Deprecated. Use min_total_size instead.
            - paralell_processing: Deprecated. Use parallel_processing instead.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: DataFrame with the bootstrapped statistics
            and DataFrame with the corresponding p-values.
    """
    map_deprecated_params = {
        "id_column": "obj_id_column",
        "meas_column": "measurement_column",
        "smoothK": "smooth_k",
        "biasK": "bias_k",
        "peakThr": "peak_threshold",
        "binThr": "binarization_threshold",
        "polyDeg": "polynomial_degree",
        "biasMet": "bias_method",
        "epsPrev": "eps_prev",
        "minClsz": "min_clustersize",
        "min_size": "min_total_size",
        "paralell_processing": "parallel_processing",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)
    smooth_k = updated_kwargs.get("smooth_k", smooth_k)
    bias_k = updated_kwargs.get("bias_k", bias_k)
    peak_threshold = updated_kwargs.get("peak_threshold", peak_threshold)
    binarization_threshold = updated_kwargs.get("binarization_threshold", binarization_threshold)
    polynomial_degree = updated_kwargs.get("polynomial_degree", polynomial_degree)
    bias_method = updated_kwargs.get("bias_method", bias_method)
    eps_prev = updated_kwargs.get("eps_prev", eps_prev)
    min_clustersize = updated_kwargs.get("min_clustersize", min_clustersize)
    min_total_size = updated_kwargs.get("min_total_size", min_total_size)
    parallel_processing = updated_kwargs.get("parallel_processing", parallel_processing)

    if not isinstance(stats_metric, list):
        stats_metric = [stats_metric]

    for stats_m in stats_metric:
        if stats_m not in [
            "duration",
            "total_size",
            "min_size",
            "max_size",
        ]:
            raise ValueError(f"Invalid metric: {stats_metric}")

    if pval_alternative not in ["less", "greater"]:
        raise ValueError(f"Invalid alternative hypothesis: {pval_alternative}")

    clid_name = 'clid'

    if isinstance(method, str):
        print(f'Resampling data using method "{method}"...')
    elif isinstance(method, list):
        print(f'Resampling data using methods "{method}"...')

    df_resampled = resample_data(
        data=df,
        position_columns=position_columns,
        frame_column=frame_column,
        obj_id_column=obj_id_column,
        measurement_column=measurement_column,
        method=method,
        n=n,
        seed=seed,
        allow_duplicates=allow_duplicates,
        max_tries=max_tries,
        show_progress=show_progress,
        verbose=verbose,
        parallel_processing=parallel_processing,
    )

    iterations = df_resampled['iteration'].unique()

    print(f'Running ARCOS and calculating "{stats_metric}"...')

    stats_df, stats_df_mean = calculate_arcos_stats(
        df_resampled=df_resampled,
        position_columns=position_columns,
        frame_column=frame_column,
        obj_id_column=obj_id_column,
        measurement_column=measurement_column,
        smooth_k=smooth_k,
        bias_k=bias_k,
        peak_threshold=peak_threshold,
        binarization_threshold=binarization_threshold,
        polynomial_degree=polynomial_degree,
        bias_method=bias_method,
        eps=eps,
        eps_prev=eps_prev,
        min_clustersize=min_clustersize,
        n_prev=n_prev,
        min_duration=min_duration,
        min_total_size=min_total_size,
        stats_metric=stats_metric,
        show_progress=show_progress,
        parallel_processing=parallel_processing,
        clid_column=clid_name,
        iterations=iterations,
    )
    df_p = calculate_pvalue(stats_df_mean, stats_metric, pval_alternative, finite_correction, plot)
    return stats_df, df_p
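
A minimal sketch with synthetic tracks (the data are illustrative; n is kept small for speed, and the import path follows the validation module above):

import numpy as np
import pandas as pd

from arcos4py.validation import bootstrap_arcos

# random tracks with a noisy measurement, in "long format"
rng = np.random.default_rng(1)
n_frames, n_obj = 100, 20
df = pd.DataFrame(
    {
        'frame': np.tile(np.arange(n_frames), n_obj),
        'obj_id': np.repeat(np.arange(n_obj), n_frames),
        'x': rng.uniform(0, 10, n_frames * n_obj),
        'm': rng.random(n_frames * n_obj),
    }
)

stats_df, df_p = bootstrap_arcos(
    df,
    position_columns=['x'],
    frame_column='frame',
    obj_id_column='obj_id',
    measurement_column='m',
    method='shuffle_tracks',
    n=10,
    plot=False,
    show_progress=False,
)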

calculate_arcos_stats(df_resampled, iterations, position_columns=['x'], frame_column='frame', obj_id_column='obj_id', measurement_column='m', smooth_k=3, bias_k=51, peak_threshold=0.2, binarization_threshold=0.1, polynomial_degree=1, bias_method='runmed', eps=2, eps_prev=None, min_clustersize=1, n_prev=1, min_duration=1, min_total_size=1, stats_metric=['duration', 'total_size'], show_progress=True, parallel_processing=True, clid_column='clid', **kwargs)

Calculate the bootstrapped statistics.

Parameters:

Name Type Description Default
df_resampled DataFrame

Dataframe with resampled data.

required
iterations list[int]

List of iteration names, or range.

required
position_columns list

List of position columns.

['x']
frame_column str

Name of the frame column.

'frame'
obj_id_column str

Name of the id column.

'obj_id'
measurement_column str

Name of the measurement column.

'm'
smooth_k int

Smoothing kernel size for local detrending. Defaults to 3.

3
bias_k int

Bias kernel size for large scale detrending (used with bias_method='runmed'). Defaults to 51.

51
peak_threshold float

Peak threshold used for rescaling (used with bias_method='runmed'). Defaults to 0.2.

0.2
binarization_threshold float

Threshold for binarizing measurements after detrending. Defaults to 0.1.

0.1
polynomial_degree int

Polynomial degree used for detrending (used with bias_method='lm'). Defaults to 1.

1
bias_method str

Bias method, can be 'none', 'runmed', 'lm'. Defaults to "runmed".

'runmed'
eps float

Epsilon used for clustering active entities. Defaults to 2.

2
eps_prev int

Epsilon used for linking together clusters across time. Defaults to None.

None
min_clustersize int

Minimum cluster size. Defaults to 1.

1
n_prev int

Number of previous frames to consider when tracking clusters. Defaults to 1.

1
min_duration int

Minimum duration of a detected event. Defaults to 1.

1
min_total_size int

Minimum total size of a detected event. Defaults to 1.

1
stats_metric list[str]

List of metrics to calculate. Defaults to ['duration', 'total_size'].

['duration', 'total_size']
show_progress bool

Show progress bar. Defaults to True.

True
parallel_processing bool

Use parallel processing (via the joblib package). Defaults to True.

True
clid_column str

Name of the cluster id column. Defaults to 'clid'.

'clid'
**kwargs Any

Additional keyword arguments. Includes deprecated parameters. - posCols: Deprecated. Use position_columns instead. - id_column: Deprecated. Use obj_id_column instead. - meas_column: Deprecated. Use measurement_column instead. - smoothK: Deprecated. Use smooth_k instead. - biasK: Deprecated. Use bias_k instead. - peakThr: Deprecated. Use peak_threshold instead. - binThr: Deprecated. Use binarization_threshold instead. - polyDeg: Deprecated. Use polynomial_degree instead. - biasMet: Deprecated. Use bias_method instead. - epsPrev: Deprecated. Use eps_prev instead. - minClsz: Deprecated. Use min_clustersize instead. - min_size: Deprecated. Use min_total_size instead. - nPrev: Deprecated. Use n_prev instead. - paralell_processing: Deprecated. Use parallel_processing instead.

{}

Returns:

Name Type Description
DataFrame DataFrame

Dataframe with the bootstrapped statistics.

DataFrame DataFrame

Dataframe with mean statistics.

Source code in arcos4py/validation/_bootstrapping.py
def calculate_arcos_stats(
    df_resampled: pd.DataFrame,
    iterations: list[int],
    position_columns: list = ['x'],
    frame_column: str = 'frame',
    obj_id_column: str = 'obj_id',
    measurement_column: str = 'm',
    smooth_k: int = 3,
    bias_k: int = 51,
    peak_threshold: float = 0.2,
    binarization_threshold: float = 0.1,
    polynomial_degree: int = 1,
    bias_method: str = "runmed",
    eps: float = 2,
    eps_prev: int | None = None,
    min_clustersize: int = 1,
    n_prev: int = 1,
    min_duration: int = 1,
    min_total_size: int = 1,
    stats_metric: list[str] = ['duration', 'total_size'],
    show_progress: bool = True,
    parallel_processing: bool = True,
    clid_column: str = 'clid',
    **kwargs,
):
    """Calculate the bootstrapped statistics.

    Arguments:
        df_resampled (DataFrame): Dataframe with resampled data.
        iterations (list[int]): List of iteration names, or range.
        position_columns (list): List of position columns.
        frame_column (str): Name of the frame column.
        obj_id_column (str): Name of the id column.
        measurement_column (str): Name of the measurement column.
        smooth_k (int, optional): Smoothing kernel size for local detrending. Defaults to 3.
        bias_k (int, optional): Bias kernel size for large scale detrending (used with bias_method='runmed'). Defaults to 51.
        peak_threshold (float, optional): Peak threshold used for rescaling (used with bias_method='runmed'). Defaults to 0.2.
        binarization_threshold (float, optional): Threshold for binarizing measurements after detrending. Defaults to 0.1.
        polynomial_degree (int, optional): Polynomial degree used for detrending (used with bias_method='lm'). Defaults to 1.
        bias_method (str, optional): Bias method, can be 'none', 'runmed', 'lm'. Defaults to "runmed".
        eps (float, optional): Epsilon used for clustering active entities. Defaults to 2.
        eps_prev (int, optional): Epsilon used for linking together clusters across time. Defaults to None.
        min_clustersize (int, optional): Minimum cluster size. Defaults to 1.
        n_prev (int, optional): Number of previous frames to consider when tracking clusters. Defaults to 1.
        min_duration (int, optional): Minimum duration of a detected event. Defaults to 1.
        min_total_size (int, optional): Minimum total size of a detected event. Defaults to 1.
        stats_metric (list[str], optional): List of metrics to calculate. Defaults to ['duration', 'total_size'].
        show_progress (bool, optional): Show progress bar. Defaults to True.
        parallel_processing (bool, optional): Use parallel processing (via the joblib package). Defaults to True.
        clid_column (str, optional): Name of the cluster id column. Defaults to 'clid'.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - posCols: Deprecated. Use position_columns instead.
            - id_column: Deprecated. Use obj_id_column instead.
            - meas_column: Deprecated. Use measurement_column instead.
            - smoothK: Deprecated. Use smooth_k instead.
            - biasK: Deprecated. Use bias_k instead.
            - peakThr: Deprecated. Use peak_threshold instead.
            - binThr: Deprecated. Use binarization_threshold instead.
            - polyDeg: Deprecated. Use polynomial_degree instead.
            - biasMet: Deprecated. Use bias_method instead.
            - epsPrev: Deprecated. Use eps_prev instead.
            - minClsz: Deprecated. Use min_clustersize instead.
            - min_size: Deprecated. Use min_total_size instead.
            - nPrev: Deprecated. Use n_prev instead.
            - paralell_processing: Deprecated. Use parallel_processing instead.

    Returns:
        DataFrame (pd.DataFrame): Dataframe with the bootstrapped statistics.
        DataFrame (pd.DataFrame): Dataframe with mean statistics.
    """
    map_deprecated_params = {
        "posCols": "position_columns",
        "id_column": "obj_id_column",
        "meas_column": "measurement_column",
        "smoothK": "smooth_k",
        "biasK": "bias_k",
        "peakThr": "peak_threshold",
        "binThr": "binarization_threshold",
        "polyDeg": "polynomial_degree",
        "biasMet": "bias_method",
        "epsPrev": "eps_prev",
        "minClsz": "min_clustersize",
        "nPrev": "n_prev",
        "min_size": "min_total_size",
        "paralell_processing": "parallel_processing",
        "clid_name": "clid_column",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)
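    # For illustration (assuming handle_deprecated_params simply renames old keys to
    # their new equivalents): a call passing posCols=['x', 'y'] would yield
    # updated_kwargs == {'position_columns': ['x', 'y']}, which then overrides the
    # position_columns argument in the assignments below.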

    # Assigning the parameters
    position_columns = updated_kwargs.get("position_columns", position_columns)
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)
    smooth_k = updated_kwargs.get("smooth_k", smooth_k)
    bias_k = updated_kwargs.get("bias_k", bias_k)
    peak_threshold = updated_kwargs.get("peak_threshold", peak_threshold)
    binarization_threshold = updated_kwargs.get("binarization_threshold", binarization_threshold)
    polynomial_degree = updated_kwargs.get("polynomial_degree", polynomial_degree)
    bias_method = updated_kwargs.get("bias_method", bias_method)
    min_total_size = updated_kwargs.get("min_total_size", min_total_size)
    parallel_processing = updated_kwargs.get("parallel_processing", parallel_processing)
    clid_column = updated_kwargs.get("clid_column", clid_column)
    min_clustersize = updated_kwargs.get("min_clustersize", min_clustersize)
    eps_prev = updated_kwargs.get("eps_prev", eps_prev)
    n_prev = updated_kwargs.get("n_prev", n_prev)

    if parallel_processing:
        from joblib import Parallel, delayed

        stats_df_list = Parallel(n_jobs=-1)(
            delayed(_apply_arcos)(
                i_iter=i_iter,
                df_resampled=df_resampled,
                position_columns=position_columns,
                frame_column=frame_column,
                obj_id_column=obj_id_column,
                measurement_column=measurement_column,
                smooth_k=smooth_k,
                bias_k=bias_k,
                peak_threshold=peak_threshold,
                binarization_threshold=binarization_threshold,
                polynomial_degree=polynomial_degree,
                bias_method=bias_method,
                eps=eps,
                eps_prev=eps_prev,
                min_clustersize=min_clustersize,
                n_prev=n_prev,
                min_duration=min_duration,
                min_total_size=min_total_size,
                clid_column=clid_column,
            )
            for i_iter in tqdm(iterations, disable=not show_progress)
        )
    else:
        stats_df_list = []
        for i_iter in tqdm(iterations, disable=not show_progress):
            stats_df = _apply_arcos(
                i_iter=i_iter,
                df_resampled=df_resampled,
                position_columns=position_columns,
                frame_column=frame_column,
                obj_id_column=obj_id_column,
                measurement_column=measurement_column,
                smooth_k=smooth_k,
                bias_k=bias_k,
                peak_threshold=peak_threshold,
                binarization_threshold=binarization_threshold,
                polynomial_degree=polynomial_degree,
                bias_method=bias_method,
                eps=eps,
                eps_prev=eps_prev,
                min_clustersize=min_clustersize,
                n_prev=n_prev,
                min_duration=min_duration,
                min_total_size=min_total_size,
                clid_column=clid_column,
            )
            stats_df_list.append(stats_df)

    stats_df = pd.concat(stats_df_list, ignore_index=True)

    stats_df_indexer = ['bootstrap_iteration'] + stats_metric
    stats_df_mean: pd.DataFrame = (
        stats_df[stats_df_indexer].groupby(['bootstrap_iteration']).agg(['mean']).reset_index()
    )
    stats_df_mean = stats_df_mean.droplevel(level=1, axis=1)
    # for bootstrap iterations that did not detect any events, set the metric to 0
    stats_df_mean[stats_metric] = stats_df_mean[stats_metric].fillna(0)
    return stats_df, stats_df_mean
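
A minimal end-to-end sketch of where this function sits in the bootstrap workflow. Names and column layout are assumptions for illustration; the enclosing function is assumed here to be exported as calculate_arcos_stats from arcos4py.validation:

# Hypothetical sketch, not the library's documented example.
import pandas as pd
from arcos4py.validation import resample_data, calculate_arcos_stats

data = pd.read_csv('tracks.csv')  # assumed columns: frame, obj_id, x, y, meas
df_resampled = resample_data(
    data,
    position_columns=['x', 'y'],
    frame_column='frame',
    obj_id_column='obj_id',
    method='shuffle_tracks',
    n=100,
)
stats_df, stats_df_mean = calculate_arcos_stats(
    df_resampled=df_resampled,
    iterations=range(101),  # iteration 0 holds the original, unshuffled data
    position_columns=['x', 'y'],
    frame_column='frame',
    obj_id_column='obj_id',
    measurement_column='meas',
)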

calculate_pvalue(stats_df_mean, stats_metric, pval_alternative, finite_correction, plot, **plot_kwargs)

Calculates the p-value with the given alternative hypothesis.

Parameters:

    stats_df_mean (DataFrame): DataFrame containing the bootstrapped data. Required.
    stats_metric (str | list[str]): Metric to calculate. Can be "duration", "total_size", "min_size", "max_size" or a list of metrics. Default is ["duration", "total_size"]. Required.
    pval_alternative (str): Alternative hypothesis for the p-value calculation. Can be "less", "greater" or "both", which will return p-values for both alternatives. Required.
    finite_correction (bool): Correct p-values for finite sampling. Default is True. Required.
    plot (bool): Plot the distribution of the bootstrapped data. Required.
    **plot_kwargs (Any): Additional keyword arguments passed to seaborn's histplot.

Returns:

    DataFrame (pd.DataFrame): DataFrame containing the p-values. If plot is True, a tuple of (p-values, figure, axes) is returned instead.

Source code in arcos4py/validation/_bootstrapping.py
def calculate_pvalue(
    stats_df_mean: pd.DataFrame,
    stats_metric: str | list[str],
    pval_alternative: str,
    finite_correction: bool,
    plot: bool,
    **plot_kwargs,
):
    """Calculates the p-value with the given alternative hypothesis.

    Arguments:
        stats_df_mean (DataFrame): DataFrame containing the bootstrapped data.
        stats_metric (str | list[str]): Metric to calculate.
            Can be "duration", "total_size", "min_size", "max_size" or a list of metrics.
            Default is ["duration", "total_size"].
        pval_alternative (str): Alternative hypothesis for the p-value calculation.
            Can be "less", "greater" or both which will return p values for both alternatives.
        finite_correction (bool): Correct p-values for finite sampling. Default is True.
        plot (bool): Plot the distribution of the bootstrapped data.
        **plot_kwargs (Any): Additional keyword arguments passed to seaborn's histplot.

    Returns:
        DataFrame (pd.DataFrame): DataFrame containing the p-values. If plot is True,
            a tuple of (p-values, figure, axes) is returned instead.
    """
    if finite_correction:
        pval = stats_df_mean[stats_metric].agg(lambda x: _p_val_finite_sampling(x, pval_alternative))
    else:
        pval = stats_df_mean[stats_metric].agg(lambda x: _p_val_infinite_sampling(x, pval_alternative))
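    # Note (general statistics, not taken from this library's internals): a
    # finite-sampling correction typically computes p = (r + 1) / (n + 1), where r
    # counts bootstrap means at least as extreme as the observed value and n is the
    # number of iterations; the uncorrected estimate r / n can yield p = 0 for finite n.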
    pval.name = 'p_value'

    if isinstance(stats_metric, list):
        _stats_metric = stats_metric
    else:
        _stats_metric = [stats_metric]

    mean_control = stats_df_mean[stats_metric].iloc[0]
    stats_df_mean = stats_df_mean[stats_df_mean['bootstrap_iteration'] != 0].reset_index(drop=True)

    if plot:
        fig, axis = plt.subplots(1, len(_stats_metric))
        try:
            iter(axis)
        except TypeError:
            axis = [axis]
        for idx, (ax, stats_col) in enumerate(zip(axis, _stats_metric)):
            sns.histplot(stats_df_mean[stats_col], ax=ax, kde=True, stat='density', common_norm=False, **plot_kwargs)
            ax.set_title(stats_col)
            ax.vlines(mean_control[stats_col], ymin=0, ymax=ax.get_ylim()[1], color='red', ls='--')
            ax.set_xlabel('Value')
            if len(axis) > 1 and idx == 0:
                ax.set_ylabel('Density')
            else:
                ax.set_ylabel('')
            x_pos = ax.get_xlim()[0] + ((ax.get_xlim()[1] - ax.get_xlim()[0]) * 0.7)
            y_pos = ax.get_ylim()[0] + ((ax.get_ylim()[1] - ax.get_ylim()[0]) * 0.7)
            ax.text(
                x_pos,
                y_pos,
                f'p-value\n{pval[stats_col].values[0]:.3f}',
                ha='center',
                va='center',
                color='red',
            )
        fig.suptitle(f'Bootstrapped metrics: pval_alternative {pval.index[0]}')
        return pval, fig, axis
    return pval
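
Continuing the sketch above, a hedged usage example (the metric list and alternative hypothesis are illustrative choices, not prescribed defaults):

# Hypothetical usage, continuing the sketch above.
pval = calculate_pvalue(
    stats_df_mean,
    stats_metric=['duration', 'total_size'],
    pval_alternative='greater',  # is the observed value larger than the bootstrap null?
    finite_correction=True,
    plot=False,
)
print(pval)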

resample_data(data, position_columns=['x'], frame_column='frame', obj_id_column='obj_id', measurement_column=None, method='shuffle_tracks', n=100, seed=42, allow_duplicates=False, max_tries=100, show_progress=True, verbose=False, parallel_processing=True, **kwargs)

Resamples data in order to perform bootstrapping analysis.

Parameters:

    data (DataFrame): The data to resample. Required.
    position_columns (list): The columns to use for the position. Default ['x'].
    frame_column (str): The column to use for the frame. Default 'frame'.
    obj_id_column (str): The column to use for the object ID. Default 'obj_id'.
    measurement_column (str, optional): The column to use for the measurement. Only needed for 'shuffle_binary_blocks'. Default None.
    method (str | list[str], optional): The method to use for resampling. Available methods are: 'shuffle_tracks', 'shuffle_timepoints', 'shift_timepoints', 'shuffle_binary_blocks', 'shuffle_coordinates_timepoint'. Default 'shuffle_tracks'.
    n (int, optional): The number of resample iterations. Default 100.
    seed (int, optional): The random seed. Default 42.
    allow_duplicates (bool, optional): Whether to allow resampling to randomly generate the same data twice. Default False.
    max_tries (int, optional): The maximum number of tries to generate unique data when allow_duplicates is set to False. Default 100.
    show_progress (bool, optional): Whether to show a progress bar. Default True.
    verbose (bool, optional): Whether to print progress. Default False.
    parallel_processing (bool, optional): Whether to use parallel processing. Default True.
    **kwargs (Any): Additional keyword arguments. Includes deprecated parameters:
        - posCols (list): Deprecated. Use position_columns instead.
        - id_column (str): Deprecated. Use obj_id_column instead.
        - meas_column (str): Deprecated. Use measurement_column instead.
        - paralell_processing (bool): Deprecated. Use parallel_processing instead.

Returns:

    pd.DataFrame: The resampled data.

Source code in arcos4py/validation/_resampling.py
def resample_data(  # noqa: C901
    data: pd.DataFrame,
    position_columns: list = ['x'],
    frame_column: str = 'frame',
    obj_id_column: str = 'obj_id',
    measurement_column: Union[str, None] = None,
    method: Union[str, list[str]] = 'shuffle_tracks',
    n=100,
    seed=42,
    allow_duplicates=False,
    max_tries=100,
    show_progress=True,
    verbose=False,
    parallel_processing=True,
    **kwargs,
) -> pd.DataFrame:
    """Resamples data in order to perform bootstrapping analysis.

    Arguments:
        data (pd.DataFrame): The data to resample.
        position_columns (list): The columns to use for the position.
        frame_column (str): The column to use for the frame.
        obj_id_column (str): The column to use for the object ID.
        measurement_column (str, optional): The column to use for the measurement.
            Only needed for 'shuffle_binary_blocks'. Defaults to None.
        method (str | list[str], optional): The method to use for resampling. Defaults to 'shuffle_tracks'.
            Available methods are: 'shuffle_tracks', 'shuffle_timepoints',
            'shift_timepoints', 'shuffle_binary_blocks', 'shuffle_coordinates_timepoint'
        n (int, optional): The number of resample iterations. Defaults to 100.
        seed (int, optional): The random seed. Defaults to 42.
        allow_duplicates (bool, optional): Whether to allow resampling to randomly generate the same data twice.
            Defaults to False.
        max_tries (int, optional): The maximum number of tries to generate unique data
            when allow_duplicates is set to False. Defaults to 100.
        show_progress (bool, optional): Whether to show a progress bar. Defaults to True.
        verbose (bool, optional): Whether to print progress. Defaults to False.
        parallel_processing (bool, optional): Whether to use parallel processing. Defaults to True.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - posCols (list): Deprecated. Use position_columns instead.
            - id_column (str): Deprecated. Use obj_id_column instead.
            - meas_column (str): Deprecated. Use measurement_column instead.
            - paralell_processing (bool): Deprecated. Use parallel_processing instead.

    Returns:
        pd.DataFrame: The resampled data.
    """
    map_deprecated_params = {
        "posCols": "position_columns",
        "id_column": "obj_id_column",
        "meas_column": "measurement_column",
        "paralell_processing": "parallel_processing",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")
    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    position_columns = updated_kwargs.get("position_columns", position_columns)
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)
    parallel_processing = updated_kwargs.get("parallel_processing", parallel_processing)

    # validate the input
    if not isinstance(data, pd.DataFrame):
        raise TypeError('data must be a pandas.DataFrame')
    if not isinstance(position_columns, list):
        raise TypeError('position_columns must be a list')
    if not isinstance(frame_column, str):
        raise TypeError('frame_column must be a string')
    if not isinstance(obj_id_column, str):
        raise TypeError('obj_id_column must be a string')
    if not isinstance(measurement_column, str) and measurement_column is not None:
        raise TypeError('measurement_column must be a string or None')
    if not isinstance(method, str) and not isinstance(method, list):
        raise TypeError('method must be a string or list')
    if not isinstance(n, int):
        raise TypeError('n must be an integer')
    if not isinstance(seed, int):
        raise TypeError('seed must be an integer')
    if not isinstance(verbose, bool):
        raise TypeError('verbose must be a boolean')
    if not isinstance(parallel_processing, bool):
        raise TypeError('parallel_processing must be a boolean')

    if len(position_columns) < 1:
        raise ValueError('position_columns must contain at least one column')
    if n < 1:
        raise ValueError('n must be a positive integer')
    if seed < 0:
        raise ValueError('seed must be a non-negative integer')

    method_dict: dict[str, Callable] = {
        'shuffle_tracks': shuffle_tracks,
        'shuffle_timepoints': shuffle_timepoints,
        'shift_timepoints': shift_timepoints_per_trajectory,
        'shuffle_binary_blocks': shuffle_activity_bocks_per_trajectory,
        'shuffle_coordinates_timepoint': shuffle_coordinates_per_timepoint,
    }

    function_args: dict[str, tuple] = {
        'shuffle_tracks': (obj_id_column, position_columns, frame_column),
        'shuffle_timepoints': (obj_id_column, frame_column),
        'shift_timepoints': (obj_id_column, frame_column),
        'shuffle_binary_blocks': (obj_id_column, frame_column, measurement_column),
        'shuffle_coordinates_timepoint': (position_columns, frame_column),
    }
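    # Each supported method maps to a resampling function (method_dict) and to the
    # positional arguments that function expects (function_args); when `method` is a
    # list, the corresponding functions are applied one after another per iteration.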

    resampling_func_list = []

    # convert method to list if necessary
    if isinstance(method, str):
        methods = [method]
    else:
        methods = method

    # Check if the method is valid
    for method in methods:
        if method not in method_dict.keys():
            raise ValueError(f'method must be one of {method_dict.keys()}')
        if method == 'shuffle_binary_blocks' and measurement_column is None:
            raise ValueError('measurement_column must be set for shuffle_binary_blocks')

    # Check if the columns are in the data
    if 'shuffle_binary_blocks' in methods:
        relevant_columns = position_columns + [frame_column, obj_id_column, measurement_column]
    else:
        relevant_columns = position_columns + [frame_column, obj_id_column]

    for i in relevant_columns:
        if i not in data.columns:
            raise ValueError(f'{i} not in df.columns')

    # check if there are any Nan in the columns selected
    na_cols = []
    for i in relevant_columns:
        if data[i].isnull().values.any():
            na_cols.append(i)
    if na_cols:
        warnings.warn(f'NaN values in {na_cols}, default behaviour is to drop these rows')
        data.dropna(subset=na_cols, inplace=True)

    # Sort the data
    data.sort_values([obj_id_column, frame_column], inplace=True)

    rng = np.random.default_rng(seed)
    # create a list of random seeds between 0 and 1,000,000,000
    seed_list = rng.integers(1_000_000_000, size=n)
    df_out: list[pd.DataFrame] = []
    # shuffle xy position for each object
    if verbose:
        print(f'Resampling for each object {n} times')

    # create a list of functions to call
    for method in methods:
        resampling_func_list.append(method_dict[method])
    iter_range = range(1, n + 1)
    if parallel_processing:
        from joblib import Parallel, delayed

        # iterate over the number of resamples
        df_out = Parallel(n_jobs=-1)(
            delayed(_apply_resampling)(
                iter_number=i,
                data=data,
                methods=methods,
                resampling_func_list=resampling_func_list,
                seed_list=seed_list,
                function_args=function_args,
            )
            for i in tqdm(iter_range, disable=not show_progress)
        )

    else:
        # iterate over the number of resamples
        for i in tqdm(iter_range, disable=not show_progress):
            data_new = _apply_resampling(
                iter_number=i,
                data=data,
                methods=methods,
                resampling_func_list=resampling_func_list,
                seed_list=seed_list,
                function_args=function_args,
            )
            if not allow_duplicates:
                current_try = 0
                # make sure that data_new is not already in df_out,
                # but they are both dataframes, else redo the resampling
                while any(
                    data_new.loc[:, data_new.columns != 'iteration'].equals(prev.loc[:, prev.columns != 'iteration'])
                    for prev in df_out
                ):
                    current_try += 1
                    data_new = _apply_resampling(
                        iter_number=i,
                        data=data,
                        methods=methods,
                        resampling_func_list=resampling_func_list,
                        seed_list=seed_list,
                        function_args=function_args,
                    )
                    if current_try > max_tries:
                        raise ValueError(
                            f'Could not find a unique resampling after {max_tries} tries, '
                            'try increasing n or setting allow_duplicates=True'
                        )

            df_out.append(data_new)

    data_it0 = data.copy()
    data_it0['iteration'] = np.repeat(0, len(data_it0))
    df_out.insert(0, data_it0)
    return pd.concat(df_out)[data.columns.tolist() + ['iteration']]
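
A brief usage sketch (column names and values are hypothetical; resample_data is assumed to be re-exported from arcos4py.validation). When a list of methods is passed, they are applied in sequence within each iteration:

# Hypothetical example: apply two resampling methods in sequence.
import pandas as pd
from arcos4py.validation import resample_data

data = pd.DataFrame({
    'frame': [0, 1, 2] * 3,
    'obj_id': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    'x': [0.0, 0.1, 0.2, 5.0, 5.1, 5.2, 9.0, 9.1, 9.2],
})
df_resampled = resample_data(
    data,
    position_columns=['x'],
    frame_column='frame',
    obj_id_column='obj_id',
    method=['shuffle_tracks', 'shuffle_timepoints'],
    n=5,
    seed=42,
)
# The output stacks iterations 0..n in one frame; iteration 0 is the original input.
print(df_resampled['iteration'].unique())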