FlashGraph-ng
A new frontier in large-scale graph analysis and data mining
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Pages
sem_kmeans.h
1 /*
2  * Copyright 2015 Open Connectome Project (http://openconnecto.me)
3  * Written by Disa Mhembere (disa@jhu.edu)
4  *
5  * This file is part of FlashGraph.
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY CURRENT_KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 #ifndef __SEM_KMEANS_H__
20 #define __SEM_KMEANS_H__
21 
22 #include <math.h>
23 
24 #include <vector>
25 #include <algorithm>
26 #include <map>
27 
28 #include "graph_engine.h"
29 #include "graph_config.h"
30 #include "FGlib.h"
31 #include "save_result.h"
32 
33 using namespace fg;
34 
35 namespace {
37  class cluster
38  {
39  private:
40  std::vector<double> mean; // cluster mean
41  unsigned num_members; // cluster assignment count
42  bool complete; // Have we already divided by num_members
43 
44  void div(const unsigned val) {
45  if (num_members > 0) {
46  for (unsigned i = 0; i < mean.size(); i++) {
47  mean[i] /= double(val);
48  }
49  }
50  complete = true;
51  }
52 
53  cluster(const unsigned len) {
54  mean.assign(len, 0);
55  num_members = 0;
56  complete = false;
57  }
58 
59  cluster(const std::vector<double>& mean) {
60  set_mean(mean);
61  num_members = 1;
62  complete = true;
63  }
64 
65  public:
66  typedef typename std::shared_ptr<cluster> ptr;
67 
68  static ptr create(const unsigned len) {
69  return ptr(new cluster(len));
70  }
71 
72  static ptr create(const std::vector<double>& mean) {
73  return ptr(new cluster(mean));
74  }
75 
76  void init(const unsigned len) {
77  mean.assign(len, 0);
78  num_members = 0;
79  }
80 
81  void assign(const double val) {
82  mean.assign(mean.size(), val);
83  }
84 
85  void clear() {
86  this->assign(0);
87  this->num_members = 0;
88  complete = false;
89  }
90 
91  const std::vector<double>& get_mean() const {
92  return mean;
93  }
94 
95  void set_mean(const std::vector<double>& mean) {
96  this->mean = mean;
97  }
98 
99  const unsigned get_num_members() const {
100  return num_members;
101  }
102 
103  const bool is_complete() const {
104  return complete;
105  }
106 
107  const unsigned size() const {
108  return mean.size();
109  }
110 
111  void finalize() {
112  assert(!complete);
113  this->div(this->num_members);
114  }
115 
116  void add_member(edge_seq_iterator& id_it, data_seq_iterator& count_it) {
117  while(id_it.has_next()) {
118  vertex_id_t nid = id_it.next();
119  edge_count e = count_it.next();
120  mean[nid] += e.get_count();
121  }
122  num_members++;
123  }
124 
125  cluster& operator=(const cluster& other) {
126  this->mean = other.get_mean();
127  this->num_members = other.get_num_members();
128  return *this;
129  }
130 
131  double& operator[](const unsigned index) {
132  assert(index < mean.size());
133  return mean[index];
134  }
135 
136  cluster& operator+=(cluster& rhs) {
137  assert(rhs.size() == size());
138  // TODO vectorize perhaps
139  for (unsigned i = 0; i < mean.size(); i++) {
140  this->mean[i] += rhs[i];
141  }
142  this->num_members += rhs.get_num_members();
143  return *this;
144  }
145  };
146 
147 }
148 
149 namespace fg
150 {
151  // A class used a return object for R bindings
152  class sem_kmeans_ret
153  {
154  private:
155  FG_vector<unsigned>::ptr cluster_assignments;
156  std::vector<std::vector<double>> centers;
157  std::vector<unsigned> size;
158  unsigned iters;
159 
160  sem_kmeans_ret(const FG_vector<unsigned>::ptr cluster_assignments,
161  const std::vector<std::vector<double>> centers,
162  const std::vector<unsigned>& size, const unsigned iters) {
163  this->cluster_assignments = cluster_assignments;
164  this->centers = centers;
165  this->size = size;
166  this->iters = iters;
167  }
168 
169  public:
170  typedef typename std::shared_ptr<sem_kmeans_ret> ptr;
171 
172  static ptr create(const FG_vector<unsigned>::ptr cluster_assignments,
173  const std::vector<std::vector<double>> centers,
174  const std::vector<unsigned>& size, const unsigned iters) {
175  return ptr(new sem_kmeans_ret(cluster_assignments, centers, size, iters));
176  }
177 
178  const FG_vector<unsigned>::ptr get_cluster_assignments() const {
179  return this->cluster_assignments;
180  }
181 
182  const unsigned get_iters() const {
183  return this->iters;
184  }
185 
186  const std::vector<unsigned>& get_size() const {
187  return this->size;
188  }
189 
190  const std::vector<std::vector<double>>& get_centers() const {
191  return this->centers;
192  }
193  };
194 
203  sem_kmeans_ret::ptr compute_sem_kmeans(FG_graph::ptr fg, const size_t k, const std::string init,
204  const unsigned max_iters, const double tolerance);
205 }
206 #endif
bool has_next()
Definition: cache.h:805
FlashGraph vector that provides several parallelized methods when compared to an STL-vector. NOTE: Not an STL-compatible data structure. This vector is also ideally used with numeric data types. Methods marked with the keyword parallel are parallelized implementations.
Definition: FG_vector.h:41
Definition: vertex.h:336
sem_kmeans_ret::ptr compute_sem_kmeans(FG_graph::ptr fg, const size_t k, const std::string init, const unsigned max_iters, const double tolerance)
Compute Semi-External Memory kmeans.