Merge pull request #977 from campagnola/pseudoscatter-performance

Add a faster method for computing pseudoscatter
2020-06-13 21:08:33 -07:00 · 2020-06-13 21:08:33 -07:00 · ecd0642ca9
commit ecd0642ca9
parent 1744bb09a9 dc9aa84ce3
1 changed file with 52 additions and 4 deletions
--- a/pyqtgraph/functions.py
+++ b/pyqtgraph/functions.py
@@ -2322,14 +2322,62 @@ def invertQTransform(tr):
            raise Exception("Transform is not invertible.")
        return inv[0]
 def pseudoScatter(data, spacing=None, shuffle=True, bidir=False, method='exact'):
     """Return an array of position values needed to make beeswarm or column scatter plots.

     Used for examining the distribution of values in an array.

     Given an array of x-values, construct an array of y-values such that an x,y scatter-plot
     will not have overlapping points (it will look similar to a histogram).

     Parameters
     ----------
     data
         Array of x-values to compute offsets for.
     spacing, shuffle, bidir
         Forwarded unchanged to the selected implementation.
     method : {'exact', 'histogram'}
         'exact' dispatches to ``_pseudoScatterExact`` (stacks points one at a
         time; smooth results, but can be prohibitively slow for large
         datasets). 'histogram' dispatches to ``_pseudoScatterHistogram``
         (bins the points; faster, but can produce blocky results).

     Raises
     ------
     ValueError
         If *method* is not one of the supported names.
     """
     if method == 'exact':
         return _pseudoScatterExact(data, spacing=spacing, shuffle=shuffle, bidir=bidir)
     if method == 'histogram':
         return _pseudoScatterHistogram(data, spacing=spacing, shuffle=shuffle, bidir=bidir)
     # Fail loudly: falling through here used to return None, which only
     # surfaced later as a confusing error in the caller.
     raise ValueError(
         "Unknown pseudoScatter method %r (must be 'exact' or 'histogram')" % (method,)
     )
 def _pseudoScatterHistogram(data, spacing=None, shuffle=True, bidir=False):
     """Works by binning points into a histogram and spreading them out to fill the bin.

     Faster method, but can produce blocky results.

     Parameters
     ----------
     data
         1D sequence of x-values; converted with ``np.asarray`` so plain
         lists are accepted as well as arrays.
     spacing
         Approximate bin width. When None, ``2 * std(data) / sqrt(len(data))``
         is used.
     shuffle
         If true, points are visited in random order (``np.random.shuffle``),
         so the stacking position of a point within its bin is randomized.
     bidir
         If true, each bin's stack is centered around zero instead of
         starting at zero.

     Returns
     -------
     ndarray of float, same length as *data*, holding the stacking position
     of each point in the original (un-shuffled) input order.
     """
     data = np.asarray(data)
     npts = len(data)
     if npts == 0:
         # Nothing to place; data.min() below would raise on empty input.
         return np.empty(0)

     inds = np.arange(npts)
     if shuffle:
         np.random.shuffle(inds)
     data = data[inds]

     if spacing is None:
         spacing = 2. * np.std(data) / npts ** 0.5

     dmin = data.min()
     dmax = data.max()
     span = dmax - dmin

     # Degenerate cases (all values equal, or a non-positive spacing) collapse
     # to a single bin instead of dividing by zero.
     if span > 0 and spacing > 0:
         nbins = int(span / spacing) + 1
     else:
         nbins = 1

     if nbins > 1:
         bins = np.linspace(dmin, dmax, nbins)
         dx = bins[1] - bins[0]
         dbins = ((data - bins[0]) / dx).astype(int)
     else:
         # A single bin has no bins[1] to measure a width from; every point
         # simply lands in bin 0.
         dbins = np.zeros(npts, dtype=int)

     # Each point's y-value is the number of points already placed in its bin.
     yvals = np.empty(npts)
     binCounts = {}
     for i, j in enumerate(dbins.tolist()):
         c = binCounts.get(j, -1) + 1
         binCounts[j] = c
         yvals[i] = c

     if bidir:
         # Shift each stack down by half of its top index so it is centered
         # on zero; one vectorized lookup instead of a mask per bin.
         occupancy = np.bincount(dbins)
         yvals -= (occupancy[dbins] - 1) * 0.5

     return yvals[np.argsort(inds)]  ## un-shuffle values before returning
 def _pseudoScatterExact(data, spacing=None, shuffle=True, bidir=False):
    """Works by stacking points up one at a time, searching for the lowest position available at each point.
    This method produces nice, smooth results but can be prohibitively slow for large datasets.
    """
    inds = np.arange(len(data))
    if shuffle:
        np.random.shuffle(inds)