I represent vectors with a lot of repeated elements by a index-vector and a value vector so that the index-vector contains the indices of value-changes in the vector and the value-vector contains the value at those indices.
For instance: [1, 1, 1, 7, 7, 4, 4]
is represented by index-vector [0, 3, 5]
and value-vector [1, 7, 4]
.
I have an algorithm for adding two such vectors, but it feels like it should be possible to do it in a cleaner and faster way. Are there any better ways to do this in python/numpy?
The current algorithm combines the diffs of the two arrays and does a cumsum in the end. But this requires alot of cleanup to make sure the there aren't any duplicated index-entries, and no consecutive equal values.
import numpy as np
class SparseVector:
def __init__(self, indices, values, sanitize=False):
self._indices = np.asanyarray(indices)
self._values = np.asanyarray(values)
if sanitize:
self._sanitize()
def __add__(self, other):
all_indices = np.r_[self._indices, other._indices]
args = np.argsort(all_indices, kind='mergesort')
diffs = np.r_[self._values[0], np.diff(self._values),
other._values[0], np.diff(other._values)]
diffs = diffs[args]
return self.__class__(all_indices[args], np.cumsum(diffs), True)
def _sanitize(self):
# Remove duplicate indexes
index_diffs = np.ediff1d(self._indices, to_end=1)
changes = index_diffs != 0
self._indices = self._indices[changes]
self._values = self._values[changes]
# Remove duplicated values
value_diffs = np.ediff1d(self._values, to_begin=1)
changes = value_diffs != 0
self._indices = self._indices[changes]
self._values = self._values[changes]
def __eq__(self, other):
return np.all(self._indices == other._indices)\
and np.all(self._values == other._values)
a = SparseVector([0, 1, 3, 5, 7, 10], [0, 10, 5, 8, 10, 11])
b = SparseVector([0, 2, 3, 6, 9, 10], [3, 2, 9, 7, 11, 10])
assert a+b == SparseVector([0, 1, 2, 3, 5, 6, 7, 9],
[3, 13, 12, 14, 17, 15, 17, 21])