FlashGraph-ng
A new frontier in large-scale graph analysis and data mining
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Pages
sparse_matrix_format.h
1 #ifndef __SPARSE_MATRIX_FORMAT_H__
2 #define __SPARSE_MATRIX_FORMAT_H__
3 
4 /*
5  * Copyright 2014 Open Connectome Project (http://openconnecto.me)
6  * Written by Da Zheng (zhengda1936@gmail.com)
7  *
8  * This file is part of FlashMatrix.
9  *
10  * Licensed under the Apache License, Version 2.0 (the "License");
11  * you may not use this file except in compliance with the License.
12  * You may obtain a copy of the License at
13  *
14  * http://www.apache.org/licenses/LICENSE-2.0
15  *
16  * Unless required by applicable law or agreed to in writing, software
17  * distributed under the License is distributed on an "AS IS" BASIS,
18  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  * See the License for the specific language governing permissions and
20  * limitations under the License.
21  */
22 
23 #include "in_mem_io.h"
24 #include "io_interface.h"
25 
26 #include "vertex.h"
27 #include "matrix_header.h"
28 #include "vector_vector.h"
29 
30 namespace fm
31 {
32 
33 /*
34  * This iterates the edges in a row part.
35  */
36 class rp_edge_iterator
37 {
38  uint16_t rel_row_idx;
39  // This always points to the beginning of the row part.
40  uint16_t *rel_col_idx_start;
41  // This points to the current location of the iterator on the row part.
42  uint16_t *rel_col_idx_p;
43  // This points to the first non-zero entry in the row part.
44  const char *data_start;
45  size_t entry_size;
46 public:
47  rp_edge_iterator() {
48  rel_row_idx = 0;
49  rel_col_idx_start = NULL;
50  rel_col_idx_p = NULL;
51  data_start = NULL;
52  entry_size = 0;
53  }
54 
55  rp_edge_iterator(uint16_t rel_row_idx, uint16_t *rel_col_idx_start) {
56  this->rel_row_idx = rel_row_idx;
57  this->rel_col_idx_start = rel_col_idx_start;
58  this->rel_col_idx_p = rel_col_idx_start;
59  this->data_start = NULL;
60  this->entry_size = 0;
61  }
62 
63  rp_edge_iterator(uint16_t rel_row_idx, uint16_t *rel_col_idx_start,
64  const char *data_start, size_t entry_size) {
65  this->rel_row_idx = rel_row_idx;
66  this->rel_col_idx_start = rel_col_idx_start;
67  this->rel_col_idx_p = rel_col_idx_start;
68  this->data_start = data_start;
69  this->entry_size = entry_size;
70  }
71 
72  bool is_valid() const {
73  return rel_col_idx_start != NULL;
74  }
75 
76  size_t get_rel_row_idx() const {
77  return rel_row_idx;
78  }
79 
80  off_t get_offset() const {
81  return rel_col_idx_p - rel_col_idx_start;
82  }
83 
84  size_t get_entry_size() const {
85  return entry_size;
86  }
87 
88  bool has_next() const {
89  // The highest bit of the relative col idx has to be 0.
90  return *rel_col_idx_p <= (size_t) std::numeric_limits<int16_t>::max();
91  };
92 
93  // This returns the relative column index.
94  size_t get_curr() const {
95  return *rel_col_idx_p;
96  }
97 
98  template<class T>
99  T get_curr_data() const {
100  assert(data_start && entry_size == sizeof(T));
101  return *(((const T *) data_start) + (rel_col_idx_p - rel_col_idx_start));
102  }
103 
104  // This returns the relative column index.
105  size_t next() {
106  size_t ret = *rel_col_idx_p;
107  rel_col_idx_p++;
108  return ret;
109  }
110 
111  void append(const block_2d_size &block_size, size_t col_idx) {
112  *rel_col_idx_p = col_idx & block_size.get_ncol_mask();
113  assert(*rel_col_idx_p <= (size_t) std::numeric_limits<int16_t>::max());
114  rel_col_idx_p++;
115  }
116 
117  const char *get_curr_addr() const {
118  return (const char *) rel_col_idx_p;
119  }
120 
121  const char *get_curr_data() const {
122  return data_start + (rel_col_idx_p - rel_col_idx_start) * entry_size;
123  }
124 };
125 
126 typedef std::pair<size_t, size_t> coo_nz_t;
127 
128 /*
129  * This stores the header of a row part. It doesn't contain any attributes.
130  */
131 class sparse_row_part
132 {
133  static const int num_bits = sizeof(uint16_t) * 8;
134  uint16_t rel_row_idx;
135  uint16_t rel_col_idxs[0];
136 
137  // The row part can only be allocated in the heap. The copy constructor
138  // doesn't make any sense for this class.
139  sparse_row_part(const sparse_row_part &part) = delete;
140 public:
141  static size_t get_size(size_t num_non_zeros) {
142  return sizeof(sparse_row_part) + sizeof(uint16_t) * num_non_zeros;
143  }
144 
145  static size_t get_num_entries(size_t size) {
146  assert((size - sizeof(sparse_row_part)) % sizeof(uint16_t) == 0);
147  return (size - sizeof(sparse_row_part)) / sizeof(uint16_t);
148  }
149 
150  static size_t get_row_id_size() {
151  return sizeof(rel_row_idx);
152  }
153 
154  static size_t get_col_entry_size() {
155  return sizeof(rel_col_idxs[0]);
156  }
157 
158  sparse_row_part(uint16_t rel_row_idx) {
159  this->rel_row_idx = rel_row_idx;
160  // The highest bit indicates that a new row part starts.
161  this->rel_row_idx |= 1 << (num_bits - 1);
162  }
163 
164  size_t get_rel_row_idx() const {
165  static const int mask = (1 << (num_bits - 1)) - 1;
166  return rel_row_idx & mask;
167  }
168 
169  size_t get_rel_col_idx(size_t idx) const {
170  return rel_col_idxs[idx];
171  }
172 
173  rp_edge_iterator get_edge_iterator() {
174  return rp_edge_iterator(get_rel_row_idx(), rel_col_idxs);
175  }
176 
177  rp_edge_iterator get_edge_iterator(const char *data, size_t entry_size) {
178  return rp_edge_iterator(get_rel_row_idx(), rel_col_idxs, data,
179  entry_size);
180  }
181 };
182 
183 /*
184  * This is used to store the rows with a single non-zero entry.
185  */
186 class local_coo_t
187 {
188  static const int num_bits = sizeof(uint16_t) * 8;
189  // A block has at most 2^15 rows and cols.
190  // The most significant bit in the row index is always 1, so this can
191  // be interpreted as a row part as well.
192  uint16_t row_idx;
193  uint16_t col_idx;
194 public:
195  local_coo_t(uint16_t row_idx, uint16_t col_idx) {
196  this->row_idx = row_idx | (1 << (num_bits - 1));
197  this->col_idx = col_idx;
198  }
199 
200  uint16_t get_row_idx() const {
201  static const int mask = (1 << (num_bits - 1)) - 1;
202  return row_idx & mask;
203  }
204 
205  uint16_t get_col_idx() const {
206  return col_idx;
207  }
208 };
209 
210 /*
211  * This stores the block of a sparse matrix partitioned in 2D.
212  * The block is stored in row major.
213  */
214 class sparse_block_2d
215 {
216  uint32_t block_row_idx;
217  uint32_t block_col_idx;
218  // The total number of non-zero entries.
219  uint32_t nnz;
220  // The number of rows with non-zero entries.
221  uint16_t nrow;
222  // The number of rows with a single non-zero entry.
223  uint16_t num_coo_vals;
224  // This is where the row parts are serialized.
225  char row_parts[0];
226 
227  // The 2D-partitioned block has to be allowed in the heap. The copy
228  // constructor doesn't make sense for it.
229  sparse_block_2d(const sparse_block_2d &block) = delete;
230 
231  /*
232  * Row header size including the COO region.
233  */
234  size_t get_rindex_size() const {
235  // The space used by row ids.
236  return nrow * sparse_row_part::get_row_id_size()
237  // The space used by col index entries in the row part.
238  + nnz * sparse_row_part::get_col_entry_size()
239  // The empty row part that indicates the end of row-part region.
240  + sparse_row_part::get_row_id_size();
241  // TODO should I align to the size of non-zero entry type?
242  }
243 
244  /*
245  * The row header size excluding the COO region.
246  */
247  size_t get_rheader_size() const {
248  // The space used by row ids. The empty row part is also included.
249  return (nrow - num_coo_vals) * sparse_row_part::get_row_id_size()
250  // The space used by col index entries in the row part.
251  + (nnz - num_coo_vals) * sparse_row_part::get_col_entry_size()
252  // The empty row part that indicates the end of row-part region.
253  + sparse_row_part::get_row_id_size();
254  }
255 
256  sparse_row_part *get_rpart_end() {
257  return (sparse_row_part *) (row_parts + get_rheader_size()
258  // Let's exclude the last empty row part.
259  - sparse_row_part::get_row_id_size());
260  }
261  const sparse_row_part *get_rpart_end() const {
262  return (sparse_row_part *) (row_parts + get_rheader_size()
263  // Let's exclude the last empty row part.
264  - sparse_row_part::get_row_id_size());
265  }
266 
267  local_coo_t *get_coo_start() {
268  return (local_coo_t *) (row_parts + get_rheader_size());
269  }
270 
271  char *get_nz_data() {
272  return (char *) (row_parts + get_rindex_size());
273  }
274 
275  const char *get_nz_data() const {
276  return (char *) (row_parts + get_rindex_size());
277  }
278 public:
279  sparse_block_2d(uint32_t block_row_idx, uint32_t block_col_idx) {
280  this->block_row_idx = block_row_idx;
281  this->block_col_idx = block_col_idx;
282  nnz = 0;
283  nrow = 0;
284  num_coo_vals = 0;
285  }
286 
287  size_t get_block_row_idx() const {
288  return block_row_idx;
289  }
290 
291  size_t get_block_col_idx() const {
292  return block_col_idx;
293  }
294 
295  bool is_empty() const {
296  return nnz == 0;
297  }
298 
299  size_t get_size(size_t entry_size) const {
300  if (entry_size == 0)
301  return sizeof(*this) + get_rindex_size();
302  else
303  return sizeof(*this) + get_rindex_size() + entry_size * nnz;
304  }
305 
306  /*
307  * This doesn't count the COO entries even though they also follow
308  * the row part format.
309  */
310  bool has_rparts() const {
311  return nnz - num_coo_vals > 0;
312  }
313 
314  rp_edge_iterator get_first_edge_iterator() const {
315  assert(has_rparts());
316  // Discard the const qualifier
317  // TODO I should make a const edge iterator
318  sparse_row_part *rp = (sparse_row_part *) row_parts;
319  return rp->get_edge_iterator();
320  }
321 
322  rp_edge_iterator get_first_edge_iterator(size_t entry_size) const {
323  assert(has_rparts());
324  // Discard the const qualifier
325  // TODO I should make a const edge iterator
326  sparse_row_part *rp = (sparse_row_part *) row_parts;
327  if (entry_size == 0)
328  return rp->get_edge_iterator();
329  else
330  return rp->get_edge_iterator(get_nz_data(), entry_size);
331  }
332 
333  rp_edge_iterator get_next_edge_iterator(const rp_edge_iterator &it) const {
334  assert(!it.has_next());
335  // TODO I should make a const edge iterator
336  sparse_row_part *rp = (sparse_row_part *) it.get_curr_addr();
337  return rp->get_edge_iterator();
338  }
339 
340  rp_edge_iterator get_next_edge_iterator(const rp_edge_iterator &it,
341  size_t entry_size) const {
342  assert(!it.has_next());
343  // TODO I should make a const edge iterator
344  sparse_row_part *rp = (sparse_row_part *) it.get_curr_addr();
345  if (entry_size == 0)
346  return rp->get_edge_iterator();
347  else
348  return rp->get_edge_iterator(it.get_curr_data(), entry_size);
349  }
350 
351  bool is_rparts_end(const rp_edge_iterator &it) const {
352  // If the block doesn't have non-zero entries or there aren't non-zero
353  // entries in the row parts.
354  if (nnz == 0 || nnz - num_coo_vals == 0)
355  return true;
356 
357  return it.get_curr_addr() - sparse_row_part::get_row_id_size()
358  == (const char *) get_rpart_end();
359  }
360 
361  size_t get_num_coo_vals() const {
362  return num_coo_vals;
363  }
364  const local_coo_t *get_coo_start() const {
365  return (local_coo_t *) (row_parts + get_rheader_size());
366  }
367  const char *get_coo_val_start(size_t entry_size) const {
368  return get_nz_data() + (nnz - num_coo_vals) * entry_size;
369  }
370 
371  void append(const sparse_row_part &part, size_t part_size);
372 
373  void add_coo(const std::vector<coo_nz_t> &nnz,
374  const block_2d_size &block_size);
375 
376  /*
377  * Finalize the construction of the block.
378  * We need to add an empty row in the end if there aren't COO entries
379  * in the block, so the edge iterator can notice the end of the last row.
380  * We also need to add the non-zero values.
381  */
382  void finalize(const char *data, size_t num_bytes);
383 
384  /*
385  * Get all non-zero entries in the block.
386  * This is used for testing only.
387  */
388  std::vector<coo_nz_t> get_non_zeros(const block_2d_size &block_size) const;
389 
390  size_t get_nnz() const {
391  return nnz;
392  }
393 
394  void verify(const block_2d_size &block_size) const;
395 };
396 
397 /*
398  * This allows users to iterate the blocks in a block row.
399  */
400 class block_row_iterator
401 {
402  const sparse_block_2d *block;
403  const sparse_block_2d *end;
404 public:
405  /*
406  * `first' is the first block in the block row.
407  * `end' is the end of the last block in the block row.
408  */
409  block_row_iterator(const sparse_block_2d *first, const sparse_block_2d *end) {
410  block = first;
411  this->end = end;
412  }
413 
414  bool has_next() const {
415  return block < end;
416  }
417 
418  const sparse_block_2d &get_curr() const {
419  return *block;
420  }
421 
422  const sparse_block_2d &next(size_t entry_size) {
423  const sparse_block_2d *orig = block;
424  block = (const sparse_block_2d *) (((const char *) block)
425  + block->get_size(entry_size));
426  return *orig;
427  }
428 };
429 
430 /*
431  * This indexes a sparse matrix for easy access to the 2D-partitioned blocks.
432  */
433 class SpM_2d_index
434 {
435  struct deleter {
436  void operator()(SpM_2d_index *p) const{
437  free(p);
438  }
439  };
440 
441  matrix_header header;
442  off_t offs[0];
443 
444  SpM_2d_index(const matrix_header &_header): header(_header) {
445  }
446 
447  /*
448  * The number of offset entries in the index.
449  */
450  size_t get_num_entries() const {
451  return get_num_block_rows() + 1;
452  }
453 public:
454  typedef std::shared_ptr<SpM_2d_index> ptr;
455 
456  /*
457  * Calculate the storage size of the index.
458  */
459  static size_t get_size(size_t num_entries) {
460  return sizeof(SpM_2d_index)
461  + num_entries * sizeof(SpM_2d_index::offs[0]);
462  }
463 
464  static ptr create(const matrix_header &header,
465  const std::vector<off_t> &offs);
466  static ptr load(const std::string &idx_file);
467  static ptr safs_load(const std::string &idx_file);
468 
469  void verify() const;
470  size_t get_num_block_rows() const;
471  void dump(const std::string &file) const;
472  void safs_dump(const std::string &file) const;
473  off_t get_block_row_off(size_t idx) const;
474  const matrix_header &get_header() const {
475  return header;
476  }
477 };
478 
479 class vector_vector;
480 
481 class SpM_2d_storage
482 {
483  struct deleter {
484  void operator()(char *p) const {
485  free(p);
486  }
487  };
488 
489  std::string mat_name;
490  int mat_file_id;
491 
492  safs::NUMA_buffer::ptr data;
493  SpM_2d_index::ptr index;
494 
495  SpM_2d_storage(safs::NUMA_buffer::ptr data,
496  SpM_2d_index::ptr index, const std::string mat_name) {
497  this->data = data;
498  this->index = index;
499  this->mat_name = mat_name;
500  this->mat_file_id = -1;
501  }
502 public:
503  typedef std::shared_ptr<SpM_2d_storage> ptr;
504 
505  static ptr safs_load(const std::string &mat_file,
506  SpM_2d_index::ptr index);
507  static ptr load(const std::string &mat_file,
508  SpM_2d_index::ptr index);
509  static ptr create(const matrix_header &header, const vector_vector &vv,
510  SpM_2d_index::ptr index);
511 
512  static void verify(SpM_2d_index::ptr index, const std::string &mat_file);
513 
514  size_t get_num_block_rows() const {
515  return index->get_num_block_rows();
516  }
517 
518  void verify() const;
519  void dump(const std::string &file) const;
520 
521  std::shared_ptr<safs::file_io_factory> create_io_factory() const;
522 };
523 
524 }
525 
526 #endif
file_io_factory::shared_ptr create_io_factory(const std::string &file_name, const int access_option)