1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18  """ 
 19  MLlib utilities for linear algebra. For dense vectors, MLlib 
 20  uses the NumPy C{array} type, so you can simply pass NumPy arrays 
 21  around. For sparse vectors, users can construct a L{SparseVector} 
 22  object from MLlib or pass SciPy C{scipy.sparse} column vectors if 
 23  SciPy is available in their environment. 
 24  """ 
 25   
 26  from numpy import array, array_equal, ndarray, float64, int32 
 30      """ 
 31      A simple sparse vector class for passing data to MLlib. Users may 
 32      alternatively pass SciPy's {scipy.sparse} data types. 
 33      """ 
 34   
 36          """ 
 37          Create a sparse vector, using either a dictionary, a list of 
 38          (index, value) pairs, or two separate arrays of indices and 
 39          values (sorted by index). 
 40   
 41          @param size: Size of the vector. 
 42          @param args: Non-zero entries, as a dictionary, list of tupes, 
 43                 or two sorted lists containing indices and values. 
 44   
 45          >>> print SparseVector(4, {1: 1.0, 3: 5.5}) 
 46          [1: 1.0, 3: 5.5] 
 47          >>> print SparseVector(4, [(1, 1.0), (3, 5.5)]) 
 48          [1: 1.0, 3: 5.5] 
 49          >>> print SparseVector(4, [1, 3], [1.0, 5.5]) 
 50          [1: 1.0, 3: 5.5] 
 51          """ 
 52          self.size = int(size) 
 53          assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments" 
 54          if len(args) == 1: 
 55              pairs = args[0] 
 56              if type(pairs) == dict: 
 57                  pairs = pairs.items() 
 58              pairs = sorted(pairs) 
 59              self.indices = array([p[0] for p in pairs], dtype=int32) 
 60              self.values = array([p[1] for p in pairs], dtype=float64) 
 61          else: 
 62              assert len(args[0]) == len(args[1]), "index and value arrays not same length" 
 63              self.indices = array(args[0], dtype=int32) 
 64              self.values = array(args[1], dtype=float64) 
 65              for i in xrange(len(self.indices) - 1): 
 66                  if self.indices[i] >= self.indices[i + 1]: 
 67                      raise TypeError("indices array must be sorted") 
  68   
 69 -    def dot(self, other): 
  70          """ 
 71          Dot product with a SparseVector or 1- or 2-dimensional Numpy array. 
 72   
 73          >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) 
 74          >>> a.dot(a) 
 75          25.0 
 76          >>> a.dot(array([1., 2., 3., 4.])) 
 77          22.0 
 78          >>> b = SparseVector(4, [2, 4], [1.0, 2.0]) 
 79          >>> a.dot(b) 
 80          0.0 
 81          >>> a.dot(array([[1, 1], [2, 2], [3, 3], [4, 4]])) 
 82          array([ 22.,  22.]) 
 83          """ 
 84          if type(other) == ndarray: 
 85              if other.ndim == 1: 
 86                  result = 0.0 
 87                  for i in xrange(len(self.indices)): 
 88                      result += self.values[i] * other[self.indices[i]] 
 89                  return result 
 90              elif other.ndim == 2: 
 91                  results = [self.dot(other[:, i]) for i in xrange(other.shape[1])] 
 92                  return array(results) 
 93              else: 
 94                  raise Exception("Cannot call dot with %d-dimensional array" % other.ndim) 
 95          else: 
 96              result = 0.0 
 97              i, j = 0, 0 
 98              while i < len(self.indices) and j < len(other.indices): 
 99                  if self.indices[i] == other.indices[j]: 
100                      result += self.values[i] * other.values[j] 
101                      i += 1 
102                      j += 1 
103                  elif self.indices[i] < other.indices[j]: 
104                      i += 1 
105                  else: 
106                      j += 1 
107              return result 
 108   
110          """ 
111          Squared distance from a SparseVector or 1-dimensional NumPy array. 
112   
113          >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) 
114          >>> a.squared_distance(a) 
115          0.0 
116          >>> a.squared_distance(array([1., 2., 3., 4.])) 
117          11.0 
118          >>> b = SparseVector(4, [2, 4], [1.0, 2.0]) 
119          >>> a.squared_distance(b) 
120          30.0 
121          >>> b.squared_distance(a) 
122          30.0 
123          """ 
124          if type(other) == ndarray: 
125              if other.ndim == 1: 
126                  result = 0.0 
127                  j = 0    
128                  for i in xrange(other.shape[0]): 
129                      if j < len(self.indices) and self.indices[j] == i: 
130                          diff = self.values[j] - other[i] 
131                          result += diff * diff 
132                          j += 1 
133                      else: 
134                          result += other[i] * other[i] 
135                  return result 
136              else: 
137                  raise Exception("Cannot call squared_distance with %d-dimensional array" % 
138                                  other.ndim) 
139          else: 
140              result = 0.0 
141              i, j = 0, 0 
142              while i < len(self.indices) and j < len(other.indices): 
143                  if self.indices[i] == other.indices[j]: 
144                      diff = self.values[i] - other.values[j] 
145                      result += diff * diff 
146                      i += 1 
147                      j += 1 
148                  elif self.indices[i] < other.indices[j]: 
149                      result += self.values[i] * self.values[i] 
150                      i += 1 
151                  else: 
152                      result += other.values[j] * other.values[j] 
153                      j += 1 
154              while i < len(self.indices): 
155                  result += self.values[i] * self.values[i] 
156                  i += 1 
157              while j < len(other.indices): 
158                  result += other.values[j] * other.values[j] 
159                  j += 1 
160              return result 
 161   
163          inds = self.indices 
164          vals = self.values 
165          entries = ", ".join(["{0}: {1}".format(inds[i], vals[i]) for i in xrange(len(inds))]) 
166          return "[" + entries + "]" 
 167   
169          inds = self.indices 
170          vals = self.values 
171          entries = ", ".join(["{0}: {1}".format(inds[i], vals[i]) for i in xrange(len(inds))]) 
172          return "SparseVector({0}, {{{1}}})".format(self.size, entries) 
 173   
175          """ 
176          Test SparseVectors for equality. 
177   
178          >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)]) 
179          >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) 
180          >>> v1 == v2 
181          True 
182          >>> v1 != v2 
183          False 
184          """ 
185   
186          return (isinstance(other, self.__class__) 
187                  and other.size == self.size 
188                  and array_equal(other.indices, self.indices) 
189                  and array_equal(other.values, self.values)) 
 190   
192          return not self.__eq__(other) 
  193   
196      """ 
197      Factory methods for working with vectors. Note that dense vectors 
198      are simply represented as NumPy array objects, so there is no need 
199      to covert them for use in MLlib. For sparse vectors, the factory 
200      methods in this class create an MLlib-compatible type, or users 
201      can pass in SciPy's C{scipy.sparse} column vectors. 
202      """ 
203   
204      @staticmethod 
206          """ 
207          Create a sparse vector, using either a dictionary, a list of 
208          (index, value) pairs, or two separate arrays of indices and 
209          values (sorted by index). 
210   
211          @param size: Size of the vector. 
212          @param args: Non-zero entries, as a dictionary, list of tupes, 
213                       or two sorted lists containing indices and values. 
214   
215          >>> print Vectors.sparse(4, {1: 1.0, 3: 5.5}) 
216          [1: 1.0, 3: 5.5] 
217          >>> print Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) 
218          [1: 1.0, 3: 5.5] 
219          >>> print Vectors.sparse(4, [1, 3], [1.0, 5.5]) 
220          [1: 1.0, 3: 5.5] 
221          """ 
222          return SparseVector(size, *args) 
 223   
224      @staticmethod 
226          """ 
227          Create a dense vector of 64-bit floats from a Python list. Always 
228          returns a NumPy array. 
229   
230          >>> Vectors.dense([1, 2, 3]) 
231          array([ 1.,  2.,  3.]) 
232          """ 
233          return array(elements, dtype=float64) 
  234   
237      import doctest 
238      (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS) 
239      if failure_count: 
240          exit(-1) 
 241   
242  if __name__ == "__main__": 
243      _test() 
244