1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402
|
import numpy as np
from scipy import sparse as sp
from AnyQt.QtCore import Qt, QRectF, QSizeF, QPointF, QLineF
from AnyQt.QtGui import QColor, QBrush, QPen
from AnyQt.QtWidgets import (
QGraphicsWidget,
QGraphicsRectItem,
QGraphicsLinearLayout,
QSizePolicy,
QGraphicsLineItem,
)
import Orange.statistics.util as ut
from Orange.data.util import one_hot
class BarItem(QGraphicsWidget):
"""A single bar in a histogram representing one single target value."""
def __init__(self, width, height, color, parent=None):
super().__init__(parent=parent)
self.width = width
self.height = height
self.color = color
if not isinstance(self.color, QColor):
self.color = QColor(self.color)
self.__rect = QGraphicsRectItem(0, 0, self.width, self.height, self)
self.__rect.setPen(QPen(Qt.NoPen))
self.__rect.setBrush(QBrush(self.color))
def boundingRect(self):
return self.__rect.boundingRect()
def sizeHint(self, which, constraint):
return self.boundingRect().size()
def sizePolicy(self):
return QSizePolicy(QSizePolicy.Expanding, QSizePolicy.Fixed)
class ProportionalBarItem(QGraphicsLinearLayout):
"""A bar that fills draws ``'BarItem'`` objects given some proportions.
Parameters
----------
distribution : np.ndarray
Contains the counts of individual target values that belong to the
particular bin. This can have length 1 if there is no target class.
colors : Optional[Iterable[QColor]]
If colors are passed, they must match the shape of the distribution.
The bars will be colored according to these values, where the indices
in the distribution must match the color indices.
bar_size : Union[int, float]
The width of the bar.
height : Union[int, float]
The height of the bar.
"""
def __init__(self, distribution, bar_size=10, height=100, colors=None):
super().__init__()
self.distribution = distribution
assert not colors or len(distribution) is len(colors), \
'If colors are provided, they must match the shape of distribution'
self.colors = colors
self.height = height
self.setOrientation(Qt.Vertical)
self._bar_size = bar_size
self.setSpacing(0)
self.setContentsMargins(0, 0, 0, 0)
self._draw_bars()
def _draw_bars(self):
heights, dist_sum = self.distribution, self.distribution.sum()
# If the number of instances within a column is not 0, divide by that
# sum to get the proportional height, otherwise set the height to 0
heights *= (dist_sum ** -1 if dist_sum != 0 else 0) * self.height
for idx, height in enumerate(heights):
color = self.colors[idx] if self.colors else QColor('#ccc')
self.addItem(BarItem(width=self._bar_size, height=height, color=color))
def sizeHint(self, which, constraint):
return QSizeF(self._bar_size, self.height)
def sizePolicy(self):
return QSizePolicy(QSizePolicy.Expanding, QSizePolicy.Fixed)
# The price of flexibility is complexity...
# pylint: disable=too-many-instance-attributes
class Histogram(QGraphicsWidget):
"""A basic histogram widget.
Parameters
----------
data : Table
variable : Union[int, str, Variable]
parent : QObject
height : Union[int, float]
width : Union[int, float]
side_padding : Union[int, float]
Specify the padding between the edges of the histogram and the
first and last bars.
top_padding : Union[int, float]
Specify the padding between the top of the histogram and the
highest bar.
bar_spacing : Union[int, float]
Specify the amount of spacing to place between individual bars.
border : Union[Tuple[Union[int, float]], int, float]
Can be anything that can go into the ``'QColor'`` constructor.
Draws a border around the entire histogram in a given color.
border_color : Union[QColor, str]
class_index : int
The index of the target variable in ``'data'``.
n_bins : int
"""
def __init__(self, data, variable, parent=None, height=200,
width=300, side_padding=5, top_padding=20, bottom_padding=0,
bar_spacing=4,
border=0, border_color=None, color_attribute=None, n_bins=10):
super().__init__(parent)
self.height, self.width = height, width
self.padding = side_padding
self.bar_spacing = bar_spacing
self.data = data
self.attribute = data.domain[variable]
self.x = data.get_column(self.attribute)
self.x_nans = np.isnan(self.x)
self.x = self.x[~self.x_nans]
if self.attribute.is_discrete:
self.n_bins = len(self.attribute.values)
elif self.attribute.is_continuous:
# If the attribute is continuous but contains fewer values than the
# bins, it is better to assign each their own bin. We will require
# at least 2 bins so that the histogram still visually makes sense
# except if there is only a single value, then we use 3 bins for
# symmetry
num_unique = ut.nanunique(self.x).shape[0]
if num_unique == 1:
self.n_bins = 3
else:
self.n_bins = min(max(2, num_unique), n_bins)
# Handle target variable index
self.color_attribute = color_attribute
if self.color_attribute is not None:
self.target_var = data.domain[color_attribute]
self.y = data.get_column(color_attribute)
self.y = self.y[~self.x_nans]
if not np.issubdtype(self.y.dtype, np.number):
self.y = self.y.astype(np.float64)
else:
self.target_var, self.y = None, None
# Borders
self.border_color = border_color if border_color is not None else '#000'
if isinstance(border, tuple):
assert len(border) == 4, 'Border tuple must be of size 4.'
self.border = border
else:
self.border = (border, border, border, border)
t, r, b, l = self.border
def _draw_border(point_1, point_2, border_width, parent):
pen = QPen(QColor(self.border_color))
pen.setCosmetic(True)
pen.setWidth(border_width)
line = QGraphicsLineItem(QLineF(point_1, point_2), parent)
line.setPen(pen)
return line
top_left = QPointF(0, 0)
bottom_left = QPointF(0, self.height)
top_right = QPointF(self.width, 0)
bottom_right = QPointF(self.width, self.height)
self.border_top = _draw_border(top_left, top_right, t, self) if t else None
self.border_bottom = _draw_border(bottom_left, bottom_right, b, self) if b else None
self.border_left = _draw_border(top_left, bottom_left, l, self) if l else None
self.border_right = _draw_border(top_right, bottom_right, r, self) if r else None
# _plot_`dim` accounts for all the paddings and spacings
self._plot_height = self.height
self._plot_height -= top_padding + bottom_padding
self._plot_height -= t / 4 + b / 4
self._plot_width = self.width
self._plot_width -= 2 * side_padding
self._plot_width -= (self.n_bins - 2) * bar_spacing
self._plot_width -= l / 4 + r / 4
self.__layout = QGraphicsLinearLayout(Qt.Horizontal, self)
self.__layout.setContentsMargins(
side_padding + r / 2,
top_padding + t / 2,
side_padding + l / 2,
bottom_padding + b / 2
)
self.__layout.setSpacing(bar_spacing)
# If the data contains any non-NaN values, we can draw a histogram
if self.x.size > 0:
self.edges, self.distributions = self._histogram()
self._draw_histogram()
def _get_histogram_edges(self):
"""Get the edges in the histogram based on the attribute type.
In case of a continuous variable, we split the variable range into
n bins. In case of a discrete variable, bins don't make sense, so we
just return the attribute values.
This will return the staring and ending edge, not just the edges in
between (in the case of a continuous variable).
Returns
-------
np.ndarray
"""
if self.attribute.is_discrete:
return np.array([self.attribute.to_val(v) for v in self.attribute.values])
else:
edges = np.linspace(ut.nanmin(self.x), ut.nanmax(self.x), self.n_bins)
edge_diff = edges[1] - edges[0]
edges = np.hstack((edges, [edges[-1] + edge_diff]))
# If the variable takes on a single value, we still need to spit
# out some reasonable bin edges
if np.all(edges == edges[0]):
edges = np.array([edges[0] - 1, edges[0], edges[0] + 1])
return edges
def _get_bin_distributions(self, bin_indices):
"""Compute the distribution of instances within bins.
Parameters
----------
bin_indices : np.ndarray
An array with same shape as `x` but containing the bin index of the
instance.
Returns
-------
np.ndarray
A 2d array; the first dimension represents different bins, the
second - the counts of different target values.
"""
if self.target_var and self.target_var.is_discrete:
y = self.y
# TODO This probably also isn't the best handling of sparse data...
if sp.issparse(y):
y = np.squeeze(np.array(y.todense()))
# Since y can contain missing values, we need to filter them out as
# well as their corresponding `x` values
y_nan_mask = np.isnan(y)
y, bin_indices = y[~y_nan_mask], bin_indices[~y_nan_mask]
y = one_hot(y, dim=len(self.target_var.values))
bins = np.arange(self.n_bins)[:, np.newaxis]
mask = bin_indices == bins
distributions = np.zeros((self.n_bins, y.shape[1]))
for bin_idx in range(self.n_bins):
distributions[bin_idx] = y[mask[bin_idx]].sum(axis=0)
else:
distributions, _ = ut.bincount(bin_indices.astype(np.int64))
# To keep things consistent across different variable types, we
# want to return a 2d array where the first dim represent different
# bins, and the second the distributions.
distributions = distributions[:, np.newaxis]
return distributions
def _histogram(self):
assert self.x.size > 0, 'Cannot calculate histogram on empty array'
edges = self._get_histogram_edges()
if self.attribute.is_discrete:
bin_indices = self.x
# TODO It probably isn't a very good idea to convert a sparse row
# to a dense array... Converts sparse to 1d numpy array
if sp.issparse(bin_indices):
bin_indices = np.squeeze(np.asarray(
bin_indices.todense(), dtype=np.int64
))
elif self.attribute.is_continuous:
bin_indices = ut.digitize(self.x, bins=edges[1:-1]).flatten()
distributions = self._get_bin_distributions(bin_indices)
return edges, distributions
def _draw_histogram(self):
# In case the data for the variable were all NaNs, then the
# distributions will be empty, and we don't need to display any bars
if self.x.size == 0:
return
# In case we have a target var, but the values are all NaNs, then there
# is no sense in displaying anything
if self.target_var:
y_nn = self.y[~np.isnan(self.y)]
if y_nn.size == 0:
return
if self.distributions.ndim > 1:
largest_bin_count = self.distributions.sum(axis=1).max()
else:
largest_bin_count = self.distributions.max()
bar_size = self._plot_width / self.n_bins
for distr, bin_colors in zip(self.distributions, self._get_colors()):
bin_count = distr.sum()
bar_height = bin_count / largest_bin_count * self._plot_height
bar_layout = QGraphicsLinearLayout(Qt.Vertical)
bar_layout.setSpacing(0)
bar_layout.addStretch()
self.__layout.addItem(bar_layout)
bar = ProportionalBarItem( # pylint: disable=blacklisted-name
distribution=distr, colors=bin_colors, height=bar_height,
bar_size=bar_size,
)
bar_layout.addItem(bar)
self.layout()
def _get_colors(self):
"""Compute colors for different kinds of histograms."""
target = self.target_var
if target and target.is_discrete:
colors = [list(target.palette)[:len(target.values)]] * self.n_bins
elif self.target_var and self.target_var.is_continuous:
palette = self.target_var.palette
bins = np.arange(self.n_bins)[:, np.newaxis]
edges = self.edges if self.attribute.is_discrete else self.edges[1:-1]
bin_indices = ut.digitize(self.x, bins=edges)
mask = bin_indices == bins
colors = []
for bin_idx in range(self.n_bins):
biny = self.y[mask[bin_idx]]
if np.isfinite(biny).any():
mean = ut.nanmean(biny) / ut.nanmax(self.y)
else:
mean = 0 # bin is empty, color does not matter
colors.append([palette.value_to_qcolor(mean)])
else:
colors = [[QColor('#ccc')]] * self.n_bins
return colors
def boundingRect(self):
return QRectF(0, 0, self.width, self.height)
def sizeHint(self, which, constraint):
return QSizeF(self.width, self.height)
def sizePolicy(self):
return QSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding)
if __name__ == '__main__':
import sys
from Orange.data.table import Table
from AnyQt.QtWidgets import ( # pylint: disable=ungrouped-imports
QGraphicsView, QGraphicsScene, QApplication, QWidget
)
app = QApplication(sys.argv)
widget = QWidget()
widget.resize(500, 300)
scene = QGraphicsScene(widget)
view = QGraphicsView(scene, widget)
dataset = Table(sys.argv[1] if len(sys.argv) > 1 else 'iris')
histogram = Histogram(
dataset, variable=0, height=300, width=500, n_bins=20, bar_spacing=2,
border=(0, 0, 5, 0), border_color='#000', color_attribute='iris',
)
scene.addItem(histogram)
widget.show()
app.exec()
|