import collections
import csv
import datetime
import math
import os
import statistics

import matplotlib.pyplot as plt  # plt is used below for the trace plots
import numpy as np               # np is used throughout for array handling
import pandas as pd
from skimage import io

import puncta_tracker as tracker
# the star imports below supply the image/blob helpers used in this module
# (e.g. dot_gaussian_fit, measure_stack_profile, mean_square_displacement)
from images import *
from blob_detector import *


def write_dot_to_file(dot_id, data_array):
# data_array must be a 3D array; dot_id must be an integer
# dot is then saved as a text file to the Dots folder; to change file path, edit directly below
# the read and write functions are basically from
# https://stackoverflow.com/questions/3685265/how-to-write-a-multidimensional-array-to-a-text-file/18145279
    now = datetime.date.today()
    file_name = 'Dots/' + now.isoformat() + '/' + str(dot_id) + '.txt'
    os.makedirs(os.path.dirname(file_name), exist_ok=True)  # make sure the dated Dots subfolder exists
    with open(file_name, 'w') as outfile:
# Any line starting with "#" will be ignored by numpy.loadtxt
outfile.write('# Array shape: {0}\n'.format(data_array.shape))
for data_slice in data_array:
            # The format string writes the values in left-justified columns
            # 7 characters wide, with 2 decimal places.
np.savetxt(outfile, data_slice, fmt='%-7.2f')
# Writing out a break to indicate different slices...
outfile.write('# New slice\n')


def read_dot_file(path, dot_id, data_shape):
    # data_shape is required to parse the data stored using "write_dot_to_file"
    # path is the string specifying the folder where the data are stored;
    # include the final '/' in the path
    # dot_id needs to be an integer
# data_shape is a three tuple in (depths, rows, columns)
# returns data as a ndarray of the specified shape
file_name = path + str(dot_id) + '.txt'
data = np.loadtxt(file_name).reshape(data_shape)
return data
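

# A minimal round-trip sketch for the two helpers above. The array shape and
# dot ID are made up for illustration; write_dot_to_file always saves under
# 'Dots/<today's ISO date>/', so the matching read path is derived the same way.
def _example_dot_round_trip():
    dot_id = 1  # hypothetical ID
    original = np.arange(24, dtype=np.float64).reshape(2, 3, 4)
    write_dot_to_file(dot_id, original)
    path = 'Dots/' + datetime.date.today().isoformat() + '/'
    restored = read_dot_file(path, dot_id, (2, 3, 4))
    assert np.allclose(original, restored)  # values survive at 2-decimal precision

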
def trace_csv_writer(file_path_name, trace_vector):
    # trace_vector is a 1D array of the measured trace
    # for now this is used for exporting trace data to MATLAB for trace analysis
    # file_path_name is a string specifying the output location as well as the file name,
    # for example: "folder/file.csv"
    with open(file_path_name, 'a', newline='') as file:  # newline='' keeps csv from writing blank rows on Windows
        writer = csv.writer(file, delimiter=',')
        for value in trace_vector:
            writer.writerow([value])
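

# Usage sketch for trace_csv_writer, with a hypothetical file name. Note that
# the file is opened in append mode, so repeated calls keep adding rows.
def _example_trace_csv():
    os.makedirs('Output', exist_ok=True)  # the target folder must exist
    trace_csv_writer('Output/example_trace.csv', [0.0, 0.5, 1.0, 0.5])

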
def compile_trace_data(dots_database: pd.DataFrame, dot_trace_mapping: dict, frame_time_interval=1):
    # This function takes the dot_database (generated by find_puncta) and the dot_trace_mapping (generated by
    # simple_tracker) and produces a DataFrame with the following columns: trace_ID, ave_xcoor, ave_ycoor, msd,
    # first_frame, last_frame, dwell_by_frame, dwell_time.
    # The DataFrame is compiled in the same way that the dot_database is constructed.
traces = {'trace_ID': [],
'ave_xcoor': [],
'ave_ycoor': [],
'msd': [],
'first_frame': [],
'last_frame': [],
'dwell_by_frame': [],
'dwell_time': []}
# find out the range of trace_ids, as reflected by the numbering of the trace_mapping_dict
number_of_traces = max(dot_trace_mapping.values())
for current_trace_id in range(1, 1 + number_of_traces):
        # collect all the dot_ids associated with the current trace_ID
        dots_list = [dot_id for dot_id, trace_id in dot_trace_mapping.items()
                     if trace_id == current_trace_id]
# after collecting relevant dot_ids, use dot_database to slice out the information of the associated dots
# and transform the data needed for the trace
filtered_dots_database = dots_database[dots_database['dot_ID'].isin(dots_list)]
ave_xcoor = statistics.mean(filtered_dots_database['xcoor'])
ave_ycoor = statistics.mean(filtered_dots_database['ycoor'])
first_frame = min(filtered_dots_database['frame'])
last_frame = max(filtered_dots_database['frame'])
dwell_by_frame = last_frame - first_frame + 1
dwell_time = dwell_by_frame * frame_time_interval
        # 5/20/19: added a new statistic, the mean square displacement (MSD)
if dwell_by_frame > 1:
msd = mean_square_displacement(filtered_dots_database['xcoor'], filtered_dots_database['ycoor'])
else:
msd = 0
# add the calculated data to the traces database
traces['trace_ID'].append(current_trace_id)
traces['ave_xcoor'].append(ave_xcoor)
traces['ave_ycoor'].append(ave_ycoor)
traces['msd'].append(msd)
traces['first_frame'].append(first_frame)
traces['last_frame'].append(last_frame)
traces['dwell_by_frame'].append(dwell_by_frame)
traces['dwell_time'].append(dwell_time)
return pd.DataFrame.from_dict(traces)
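

# A minimal sketch of compiling traces from a hand-made dot table and mapping.
# Real inputs come from find_puncta and the tracker; only the columns that
# compile_trace_data reads (dot_ID, xcoor, ycoor, frame) are supplied here.
# Note that the msd column relies on mean_square_displacement from the star
# imports at the top of this module.
def _example_trace_compilation():
    dots = pd.DataFrame({'dot_ID': [1, 2, 3],
                         'xcoor': [10.0, 10.4, 80.0],
                         'ycoor': [20.0, 20.2, 90.0],
                         'frame': [1, 2, 2]})
    mapping = {1: 1, 2: 1, 3: 2}  # dots 1 and 2 form trace 1; dot 3 is trace 2
    traces = compile_trace_data(dots, mapping, frame_time_interval=0.5)
    print(traces)  # trace 1 dwells 2 frames (1.0 s); trace 2 dwells 1 frame (0.5 s)

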
def filter_dot_database_by_gaussian_fit(dot_database: pd.DataFrame, image_stack, gaussian_fit_diameter,
max_centroid_deviation, min_height_threshold, max_elliptic_aspect_ratio):
    """Deprecated. The function first generates a list of dot_ids to keep, and uses this list to reduce
    the dot_database. The reduced database is then returned."""
dots_to_keep = []
for dot in dot_database.itertuples():
quality = dot_gaussian_quality_filter(dot.xcoor, dot.ycoor, image_stack[dot.frame - 1],
gaussian_fit_diameter,
max_centroid_deviation,
min_height_threshold,
max_elliptic_aspect_ratio)
if dot.dot_ID % 100 == 0:
print('Gaussian-fitting dot #' + str(dot.dot_ID))
if quality == 1:
dots_to_keep.append(dot.dot_ID)
filtered_dots = dot_database[dot_database['dot_ID'].isin(dots_to_keep)]
    filtered_dots = filtered_dots.reset_index(drop=True)  # reset_index returns a new frame, so assign the result
    return filtered_dots


def export_trace_signals(trace_database: pd.DataFrame, image_stack: np.ndarray):
    # Plots are saved to a subfolder under "Output/"
    # The measuring disc size is not exposed as a parameter; by default it has a diameter of 9
    # signals are calculated as the mean intensity of the pixels within the measuring disc
file_path = "Output/plots/"
measuring_disc_diameter = 9 # this is the default value, change if necessary
stack_depth = image_stack.shape[0]
for trace in trace_database.itertuples():
signals = [0]*stack_depth # initialize signals
for index in range(trace.first_frame - 1, trace.last_frame):
signals[index] = 1 # mark the signals picked up by puncta tracker
        # Below are the steps to measure an entire stack at a specific (x, y)
        # first make an ROI mask to zero out everything outside of the mask
        # mask = create_circular_mask(source_h, source_w, measuring_disc_diameter)
        # then measure the mean using 'measure_stack_profile' from the images module
mean = measure_stack_profile(int(round(trace.ave_xcoor)),
int(round(trace.ave_ycoor)),
image_stack,
measuring_disc_diameter)
if not os.path.exists(file_path):
os.makedirs(file_path)
        plt.clf()
        plt.suptitle(str(trace.trace_ID))  # figure-level title, so the subplot titles below don't overwrite it
plt.subplot(211)
plt.plot(np.arange(1, stack_depth + 1), mean, 'k-')
plt.title('Input')
plt.subplot(212)
plt.step(np.arange(1, stack_depth + 1), signals, color='red', lw=2)
plt.ylim(-0.2, 1.2)
plt.title('Signals')
plt.tight_layout()
plt.savefig(file_path + str(trace.trace_ID), dpi=300)


def fill_obvious_gaps_in_traces(trace_database: pd.DataFrame, max_spatial_difference, max_gaps_allowed):
    """For data with dim signals but a stable stage and low puncta density, it is possible to link gaps just by
    looking at the localization of the traces. This function contains some of the most confusing things I've ever
    done, so be careful; back then I didn't know about Pandas' drop function."""
# initialize a variable to store trace IDs for ones that need to be dropped;
# also, initialize a dictionary to store temporarily the combined traces
traces_to_drop = []
edited_traces = {'trace_ID': [],
'ave_xcoor': [],
'ave_ycoor': [],
'msd': [],
'first_frame': [],
'last_frame': [],
'dwell_by_frame': [],
'dwell_time': []}
    # I have to assign a temporary trace ID to each combined trace, because otherwise I couldn't merge the
    # combined trace database with the original one; the datatype of each column has to stay the same
current_trace_id = trace_database['trace_ID'].max() + 1
for trace in trace_database.itertuples():
# skip traces that are already processed using the if statement below
if trace.trace_ID in traces_to_drop:
continue
overlapped = trace_database[(trace_database['ave_xcoor'] - trace.ave_xcoor)**2 +
(trace_database['ave_ycoor'] - trace.ave_ycoor)**2 <=
max_spatial_difference**2]
        # decide if the trace needs to be combined: if overlapped contains exactly one row, the trace
        # has no duplicates at other time points
if len(overlapped) == 1:
continue
        # Cap the maximum gap-filling allowed. Below is my second attempt; it is
        # hardly elegant but this is a crucial function
        start_frames = np.asarray(overlapped['first_frame'])
        end_frames = np.asarray(overlapped['last_frame'])
        gaps = start_frames[1:] - end_frames[:-1]
        overlapped = overlapped.reset_index(drop=True)  # reset_index returns a new frame, so assign the result
        slice_index = 1
        for gap in gaps:
            if gap > max_gaps_allowed:
                # keep only the traces before the first oversized gap
                overlapped = overlapped[:slice_index]
                break
            else:
                slice_index += 1
# Check again after the previous operation.
        # This function is the worst I've ever written. I'd probably get an F if I turned this in as homework
if len(overlapped) == 1:
continue
first_frame = overlapped['first_frame'].min()
last_frame = overlapped['last_frame'].max()
dwell_by_frame = last_frame - first_frame + 1
dwell_time = (last_frame - first_frame + 1) * (trace.dwell_time/trace.dwell_by_frame)
# this mean is not accurate because it is not based on trace interpolation, but the difference is negligible
weighted_xcoor = overlapped['ave_xcoor'] * overlapped['dwell_by_frame']
weighted_ycoor = overlapped['ave_ycoor'] * overlapped['dwell_by_frame']
ave_xcoor = weighted_xcoor.sum() / dwell_by_frame
ave_ycoor = weighted_ycoor.sum() / dwell_by_frame
# I don't know the correct way to merge msd without the raw data, so my best attempt here is to do a weighted
# average for msd
msd_weighted = overlapped['msd'] * overlapped['dwell_by_frame']
msd = msd_weighted.sum() / dwell_by_frame
traces_to_drop += overlapped['trace_ID'].tolist()
edited_traces['trace_ID'].append(current_trace_id)
edited_traces['ave_xcoor'].append(ave_xcoor)
edited_traces['ave_ycoor'].append(ave_ycoor)
edited_traces['msd'].append(msd)
edited_traces['first_frame'].append(first_frame)
edited_traces['last_frame'].append(last_frame)
edited_traces['dwell_by_frame'].append(dwell_by_frame)
edited_traces['dwell_time'].append(dwell_time)
current_trace_id += 1
    edited_traces = pd.DataFrame.from_dict(edited_traces)
    if edited_traces.empty:
        return trace_database
    else:
        # ~ is equivalent to "not": the statement below filters out the traces in the 'traces_to_drop' list
        trimmed_trace_database = trace_database[~trace_database['trace_ID'].isin(traces_to_drop)]
        return pd.concat([trimmed_trace_database, edited_traces], ignore_index=True)
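

# A small sanity-check sketch for fill_obvious_gaps_in_traces, using a made-up
# two-trace database: the same spot detected on frames 1-10 and again on
# frames 13-20 should collapse into one trace when a 3-frame gap is allowed.
def _example_gap_filling():
    broken = pd.DataFrame({'trace_ID': [1, 2],
                           'ave_xcoor': [50.0, 50.2],
                           'ave_ycoor': [60.0, 60.1],
                           'msd': [0.5, 0.4],
                           'first_frame': [1, 13],
                           'last_frame': [10, 20],
                           'dwell_by_frame': [10, 8],
                           'dwell_time': [10.0, 8.0]})
    merged = fill_obvious_gaps_in_traces(broken, max_spatial_difference=2, max_gaps_allowed=5)
    print(merged)  # expect a single combined trace spanning frames 1-20

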
def remove_traces_with_long_gaps(trace_database: pd.DataFrame, max_spatial_difference, gap_threshold):
    # initialize a list to store the trace IDs that need to be dropped
traces_to_drop = []
for trace in trace_database.itertuples():
# skip traces that are already processed using the if statement below
if trace.trace_ID in traces_to_drop:
continue
overlapped = trace_database[(trace_database['ave_xcoor'] - trace.ave_xcoor) ** 2 +
(trace_database['ave_ycoor'] - trace.ave_ycoor) ** 2 <=
max_spatial_difference ** 2]
        # decide if this trace was broken up: if overlapped contains exactly one row, the trace has no
        # duplicates at other time points besides itself
if len(overlapped) == 1:
continue
# Otherwise, update the overlapped list to get rid of traces that have been processed in previous iterations
overlapped = overlapped[overlapped['trace_ID'] >= trace.trace_ID]
        # if overlapped contains more than one trace, this was probably a single trace broken up by poor
        # signal quality, possibly at more than one break point.
        # The first task is to calculate the lengths of the gaps between these overlapped traces:
        # find all the start frames
start_frames = np.asarray(overlapped['first_frame'])
# find all the end frames
end_frames = np.asarray(overlapped['last_frame'])
        # subtract the end-frame number of the nth trace from the start-frame number of the (n+1)th trace.
        # This creates a list of gaps
gaps = start_frames[1:] - end_frames[:-1]
        # now, examine each gap against the max-gap parameter set by the function argument. Use reset_index on
        # overlapped to allow easy positional retrieval of the specific traces involved
        overlapped = overlapped.reset_index(drop=True)  # reset_index returns a new frame, so assign the result
trace_to_drop_index = 0
        for gap in gaps:
            if gap < gap_threshold:
                # a gap smaller than the threshold means the two traces flanking it are considered a single
                # trace broken in two; both of them therefore go on the traces_to_drop list
                traces_to_drop.append(overlapped.iloc[trace_to_drop_index]['trace_ID'])
                traces_to_drop.append(overlapped.iloc[trace_to_drop_index + 1]['trace_ID'])
            # a gap at or above the threshold means the two flanking traces really are separate, so
            # neither is dropped on account of this gap
            trace_to_drop_index += 1
    # finally, remove duplicates in the traces_to_drop list by converting it to a dictionary and back to a list
traces_to_drop = list(dict.fromkeys(traces_to_drop))
filtered = trace_database[~trace_database['trace_ID'].isin(traces_to_drop)]
return filtered


def add_gaussian_fit_params_to_dot_database(dot_database: pd.DataFrame, image_stack, gaussian_fit_diameter):
    """The function takes a dot_database and returns it with the per-dot Gaussian-fit parameters appended."""
    # reset the index so that the column-wise concat at the end aligns row for row
    dot_database = dot_database.reset_index(drop=True)
gaussian_params = {'height': [],
'gaussian_x': [],
'gaussian_y': [],
'squared_deviation_from_centroid': [],
'gaussian_width_x': [],
'gaussian_width_y': [],
'elliptic_aspect_ratio': []}
counter = 0
for dot in dot_database.itertuples():
if counter % 1000 == 0:
current_time = datetime.datetime.now()
print('Gaussian-fitted {} dots at {}'.format(counter, current_time.strftime("%H:%M:%S")))
height, x, y, width_x, width_y = dot_gaussian_fit(dot.xcoor, dot.ycoor,
image_stack[dot.frame - 1], gaussian_fit_diameter)
centroid_x = centroid_y = gaussian_fit_diameter / 2
squared_deviation_from_centroid = (x - centroid_x) ** 2 + (y - centroid_y) ** 2
elliptic_aspect_ratio = max(width_x, width_y) / min(width_x, width_y)
gaussian_params['height'].append(height)
gaussian_params['gaussian_x'].append(x)
gaussian_params['gaussian_y'].append(y)
gaussian_params['squared_deviation_from_centroid'].append(squared_deviation_from_centroid)
gaussian_params['gaussian_width_x'].append(width_x)
gaussian_params['gaussian_width_y'].append(width_y)
gaussian_params['elliptic_aspect_ratio'].append(elliptic_aspect_ratio)
counter += 1
gaussian_params = pd.DataFrame.from_dict(gaussian_params)
return pd.concat([dot_database, gaussian_params], axis=1)


def exclude_puncta_using_negative_control(experimental: pd.DataFrame,
                                          puncta_from_negative_control: pd.DataFrame,
                                          min_distance: int):
    # min_distance specifies a radius around each punctum in the negative-control (NC) data; a punctum from the
    # experimental data will be dropped if it lies within min_distance of a punctum in the NC
    # the experimental data frame is a database of traces; the negative-control frame is a database of dots
    filtered_data = experimental  # I prefer not to work on the original data; each filter below returns a new frame
for negative_control_dot in puncta_from_negative_control.itertuples():
filtered_data = filtered_data[(filtered_data['ave_xcoor'] - negative_control_dot.xcoor)**2 +
(filtered_data['ave_ycoor'] - negative_control_dot.ycoor)**2 >=
min_distance**2]
return filtered_data
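

# A minimal sketch of the NC exclusion above, with made-up coordinates: the
# experimental trace at (10, 10) sits within 5 px of an NC punctum and is
# dropped, while the trace at (100, 100) survives.
def _example_nc_exclusion():
    experimental = pd.DataFrame({'trace_ID': [1, 2],
                                 'ave_xcoor': [10.0, 100.0],
                                 'ave_ycoor': [10.0, 100.0]})
    negative_control = pd.DataFrame({'xcoor': [12.0], 'ycoor': [11.0]})
    kept = exclude_puncta_using_negative_control(experimental, negative_control, min_distance=5)
    print(kept)  # expect only trace 2

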
def gather_dots(data_image_file_name, blob_threshold):
print('currently finding dots in sample...' + data_image_file_name)
data_image = io.imread(data_image_file_name)
data_image = np.asarray(data_image, dtype=np.float64)
dots = tracker.find_puncta(data_image,
target_radius=5,
blob_min_radius=3,
blob_threshold=blob_threshold)
return dots


def filter_dots(dot_dataframe,
mean_threshold=None,
max_blob_r=None,
max_intensity=None,
max_gaussian_deviation=None,
max_elliptic_ratio=None,
min_gaussian_height=None,
max_gaussian_height=None,
max_mean_height_percentage_difference=None,
filter_dots_on_first_and_last_frames=False):
# pass the original data to the local variable filtered_dots
filtered_dots = dot_dataframe
last_frame = int(dot_dataframe['frame'].max())
if not filter_dots_on_first_and_last_frames:
filtered_dots = filtered_dots[filtered_dots['frame'] > 1]
filtered_dots = filtered_dots[filtered_dots['frame'] < last_frame]
if max_blob_r is not None:
filtered_dots = filtered_dots[filtered_dots['blob_r'] < max_blob_r]
if mean_threshold is not None:
filtered_dots = filtered_dots[filtered_dots['mean_intensity'] > mean_threshold]
if max_intensity is not None:
filtered_dots = filtered_dots[filtered_dots['mean_intensity'] < max_intensity]
if 'height' in dot_dataframe.keys() and min_gaussian_height is not None:
filtered_dots = filtered_dots[filtered_dots['height'] > min_gaussian_height]
if 'height' in dot_dataframe.keys() and max_gaussian_height is not None:
filtered_dots = filtered_dots[filtered_dots['height'] < max_gaussian_height]
if 'elliptic_aspect_ratio' in dot_dataframe.keys() and max_elliptic_ratio is not None:
filtered_dots = filtered_dots[filtered_dots['elliptic_aspect_ratio'] < max_elliptic_ratio]
if 'squared_deviation_from_centroid' in dot_dataframe.keys() and max_gaussian_deviation is not None:
filtered_dots = filtered_dots[filtered_dots['squared_deviation_from_centroid'] < max_gaussian_deviation ** 2]
if max_mean_height_percentage_difference is not None:
filtered_dots = filtered_dots[filtered_dots['mean_intensity'] / filtered_dots['height'] >
(1-max_mean_height_percentage_difference)]
filtered_dots = filtered_dots[filtered_dots['mean_intensity'] / filtered_dots['height'] <
(1+max_mean_height_percentage_difference)]
    if not filter_dots_on_first_and_last_frames:
        # dots on the first and last frames were exempted from the filters above, so add them back unfiltered
        first_frame_dots = dot_dataframe[dot_dataframe['frame'] == 1]
        last_frame_dots = dot_dataframe[dot_dataframe['frame'] == last_frame]
        combined_dots = pd.concat([first_frame_dots, filtered_dots, last_frame_dots], join='inner')
return combined_dots
else:
return filtered_dots
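

# A usage sketch for filter_dots with a made-up dot table. Only the columns the
# chosen filters touch are needed; the Gaussian-fit columns are optional and
# are checked for before use.
def _example_dot_filtering():
    dots = pd.DataFrame({'dot_ID': [1, 2, 3, 4],
                         'frame': [1, 2, 2, 3],
                         'blob_r': [3.0, 3.5, 8.0, 3.2],
                         'mean_intensity': [120.0, 150.0, 90.0, 40.0]})
    kept = filter_dots(dots, mean_threshold=50, max_blob_r=5)
    # dot 3 is dropped for its blob radius; dots 1 and 4 pass through unfiltered
    # as first/last-frame dots even though dot 4 would fail the mean threshold
    print(kept)

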
def filter_trace_by_start_end_and_xy(trace_data: pd.DataFrame, min_x, max_x, min_y, max_y):
last_frame = trace_data['last_frame'].max()
filtered_traces = trace_data[trace_data['first_frame'] > 1]
filtered_traces = filtered_traces[filtered_traces['last_frame'] < last_frame]
filtered_traces = filtered_traces[filtered_traces['ave_xcoor'] > min_x]
filtered_traces = filtered_traces[filtered_traces['ave_xcoor'] < max_x]
filtered_traces = filtered_traces[filtered_traces['ave_ycoor'] > min_y]
filtered_traces = filtered_traces[filtered_traces['ave_ycoor'] < max_y]
return filtered_traces


def count_instances(input_dataframe: pd.DataFrame, column_name: str):
    # count how many rows hold each distinct value of the given column
    cnt = collections.Counter()
for element in input_dataframe[column_name]:
cnt[element] += 1
elements = []
counts = []
for key, value in cnt.items():
elements.append(key)
counts.append(value)
output = {column_name: elements,
'occurrences': counts}
output = pd.DataFrame.from_dict(output)
output = output.sort_values(by=column_name)
return output.reset_index(drop=True)
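

# Usage sketch for count_instances with a made-up column. The same result can
# also be had with pandas' built-in value_counts, sorted by value:
# input_dataframe[column_name].value_counts().sort_index()
def _example_counting():
    df = pd.DataFrame({'trace_ID': [1, 1, 2, 3, 3, 3]})
    print(count_instances(df, 'trace_ID'))  # occurrences: 2, 1, 3

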
def filter_traces_sample_space(trace_data: pd.DataFrame, dwell_time_limit=None):
    # trace_data should be filtered already
    # recover the frame interval from the first trace (dwell_time = dwell_by_frame * interval)
    interval = trace_data['dwell_time'].iloc[0] / trace_data['dwell_by_frame'].iloc[0]
last_frame = max(trace_data['last_frame'])
max_dwell_time = last_frame * interval
if dwell_time_limit is None:
dwell_time_limit = math.ceil(max_dwell_time/2)
first_frame_limit = math.ceil((max_dwell_time - dwell_time_limit)/interval)
filtered_traces = trace_data[trace_data['first_frame'] <= first_frame_limit]
filtered_traces = filtered_traces[filtered_traces['dwell_time'] < dwell_time_limit]
return filtered_traces
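

# A worked example of the sample-space cut above, with made-up numbers: for a
# 100-frame movie at a 1 s interval, max_dwell_time is 100 s and the default
# dwell_time_limit is ceil(100 / 2) = 50 s. first_frame_limit is then
# ceil((100 - 50) / 1) = 50, so only traces starting on or before frame 50 and
# dwelling under 50 s are kept: every kept trace had the same full 50 s
# observation window available to it.

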
def remove_concurrent_and_overlapping_traces(trace_database: pd.DataFrame,
spatial_difference_threshold):
# initialize a variable to store trace IDs for ones that need to be dropped;
traces_to_drop = []
for trace in trace_database.itertuples():
# skip traces that are already processed using the if statement below
if trace.trace_ID in traces_to_drop:
continue
# first, find if there are concurrent traces that appear before the trace in question in its vicinity
overlapped = trace_database[trace_database['first_frame'] < trace.first_frame]
overlapped = overlapped[overlapped['last_frame'] >= trace.first_frame]
# these overlapped traces won't include our trace in question
# then, among these candidates, search for spatial proximity
overlapped = overlapped[(overlapped['ave_xcoor'] - trace.ave_xcoor) ** 2 +
(overlapped['ave_ycoor'] - trace.ave_ycoor) ** 2 <=
spatial_difference_threshold ** 2]
        # Test 1: if overlapped is empty, no earlier trace was still present in the vicinity when the
        # trace in question appeared
if len(overlapped) == 0:
            # Test 2: check whether other traces start on the very same frame as the trace in question
            # while also sitting in close proximity
overlapped = trace_database[trace_database['first_frame'] == trace.first_frame]
overlapped = overlapped[(overlapped['ave_xcoor'] - trace.ave_xcoor) ** 2 +
(overlapped['ave_ycoor'] - trace.ave_ycoor) ** 2 <=
spatial_difference_threshold ** 2]
# if the overlapped database returns one, it means that there are no other traces in close proximity to the
# trace in question, in this case, move on to the next execution of the for loop
if len(overlapped) == 1:
continue
# otherwise, add these traces to the list for failing Test 2.
else:
trace_IDs_list = list(overlapped['trace_ID'])
traces_to_drop += trace_IDs_list
        # otherwise, the trace in question failed Test 1 and is added to the drop list
        else:
            traces_to_drop.append(trace.trace_ID)
    # finally, remove duplicates in the traces_to_drop list by converting it to a dictionary and back to a list
traces_to_drop = list(dict.fromkeys(traces_to_drop))
filtered = trace_database[~trace_database['trace_ID'].isin(traces_to_drop)]
return filtered
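

# A minimal sketch of the concurrency filter above with made-up traces: trace 2
# starts while trace 1 (same spot) is still present, so trace 2 fails Test 1
# and is dropped; the spatially isolated trace 3 survives.
def _example_concurrency_filter():
    traces = pd.DataFrame({'trace_ID': [1, 2, 3],
                           'ave_xcoor': [10.0, 10.5, 200.0],
                           'ave_ycoor': [10.0, 10.2, 200.0],
                           'first_frame': [1, 5, 2],
                           'last_frame': [8, 12, 9]})
    print(remove_concurrent_and_overlapping_traces(traces, spatial_difference_threshold=3))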