XTL  0.1
eXtended Template Library
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
wordnet.hpp
Go to the documentation of this file.
1 
5 #pragma once
6 
#include <xtd/xtd.hpp>

#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <future>
#include <iterator>
#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include <xtd/filesystem.hpp>
15 
16 
17 namespace wordnet
18 {
19 
20 
21 
22  struct database {
23 
24  database() :
25  _data_adj(new data_file),
26  _data_adv(new data_file),
27  _data_noun(new data_file),
28  _data_verb(new verb_data_file),
29  _index_adj(new index_file),
30  _index_adv(new index_file),
31  _index_noun(new index_file),
32  _index_verb(new index_file)
33  {
34  auto make_path = [&](const char * sAddend){
35  xtd::filesystem::path oRet(XTD_ASSETS_DIR "/WordNet-3.0/dict");
36  oRet /= sAddend;
37  return oRet;
38  };
39  auto t1 = std::async(std::launch::async, [&]() {
40  return _data_adj->load(make_path("data.adj"));
41  });
42  auto t2 = std::async(std::launch::async, [&]() {
43  return _data_adv->load(make_path("data.adv"));
44  });
45  auto t3 = std::async(std::launch::async, [&]() {
46  return _data_noun->load(make_path("data.noun"));
47  });
48  auto t4 = std::async(std::launch::async, [&]() {
49  return _data_verb->load(make_path("data.verb"));
50  });
51  auto t5 = std::async(std::launch::async, [&]() {
52  return _index_adj->load(make_path("index.adj"));
53  });
54  auto t6 = std::async(std::launch::async, [&]() {
55  return _index_adv->load(make_path("index.adv"));
56  });
57  auto t7 = std::async(std::launch::async, [&]() {
58  return _index_noun->load(make_path("index.noun"));
59  });
60  auto t8 = std::async(std::launch::async, [&]() {
61  return _index_verb->load(make_path("index.verb"));
62  });
63  t1.get();
64  t2.get();
65  t3.get();
66  t4.get();
67  t5.get();
68  t6.get();
69  t7.get();
70  t8.get();
71  }
72  database(const database&) = delete;
73 
74  private:
75 
76  struct file{
77 
78  protected:
79  template <typename _RecordT, typename _ContainerT> bool load(const xtd::filesystem::path& oPath, _ContainerT& oRecords){
80  std::ifstream in(oPath);
81  in.exceptions(std::ios::badbit | std::ios::failbit);
82  xtd::string sFile((std::istreambuf_iterator<char>(in)), (std::istreambuf_iterator<char>()));
83  size_t i = 0;
84  for (; i < sFile.size(); ++i){
85  if (' ' == sFile[i] && ' ' == sFile[1 + i]){
86  for (; '\n' != sFile[i] && i < sFile.size(); ++i);
87  continue;
88  }
89  break;
90  }
91  for (; i < sFile.size(); ++i){
92  _RecordT oRecord;
93  if (!oRecord.load(sFile, i)){
94  return false;
95  }
96  oRecords.insert(std::make_pair(oRecord.synset_offset, oRecord));
97  }
98  return true;
99  }
100  };
101 
102  struct index_file : file{
103 
104  using pointer = std::shared_ptr<index_file>;
105 
106  struct record{
107  enum SyntacticCategory : char{
108  noun = 'n',
109  verb = 'v',
110  adj = 'a',
111  adverb = 'r',
112  };
113  using vector = std::vector<record>;
114  using map = std::map<uint32_t, record>;
115  std::vector<std::string> ptr_symbol;
116 
117  xtd::string lemma;
118  SyntacticCategory pos;
119  uint32_t synset_offset, synset_cnt, sense_cnt, tagsense_cnt;
120 
121  bool load(const xtd::string& sz, size_t & i){
122  std::stringstream oSS;
123  auto x = sz.find('\n', i);
124  xtd::string spos, ssynset_offset, p_cnt, ssynset_cnt, ssense_cnt, stagsense_cnt, sLine(&sz[i], &sz[x]);
125  oSS.str(sLine);
126  oSS >> lemma >> spos >> ssynset_cnt >> p_cnt;
127  pos = static_cast<SyntacticCategory>(spos[0]);
128  synset_cnt = atoi(ssynset_cnt.c_str());
129  for (auto t = atoi(p_cnt.c_str()); t; --t){
130  xtd::string sTemp;
131  oSS >> sTemp;
132  ptr_symbol.push_back(sTemp);
133  }
134  oSS >> ssense_cnt >> stagsense_cnt >> ssynset_offset;
135  sense_cnt = atoi(ssense_cnt.c_str());
136  tagsense_cnt = atoi(stagsense_cnt.c_str());
137  synset_offset = atoi(ssynset_offset.c_str());
138  lemma.replace({ '_' }, ' ');
139  i = x;
140  return true;
141  }
142  };
143 
144  record::map records;
145 
146  bool load(const xtd::filesystem::path& oPath){
147  return file::load<record>(oPath, records);
148  }
149 
150  };
151 
152 
153  struct data_file : file{
154  using pointer = std::shared_ptr<data_file>;
155  enum SynsetType : char{
156  noun = 'n',
157  verb = 'v',
158  adj = 'a',
159  adj_satellite = 's',
160  adverb = 'r',
161  };
162 
163  struct record{
164  using vector = std::vector<record>;
165  using map = std::map<uint32_t, record>;
166 
167  struct word_index{
168  using vector = std::vector<word_index>;
169  xtd::string word, lex_id;
170  word_index(const xtd::string& sword, const xtd::string& slexid) : word(sword), lex_id(slexid){}
171  };
172 
173  struct ptr{
174  enum SyntacticCategory : char{
175  noun = 'n',
176  verb = 'v',
177  adj = 'a',
178  adverb = 'r',
179  };
180  SyntacticCategory pos;
181  xtd::string pointer_symbol, source_target;
182  uint32_t synset_offset;
183  using vector = std::vector<ptr>;
184  ptr(const xtd::string& spointer_symbol, const xtd::string& ssynset_offset, SyntacticCategory spos, const xtd::string& ssource_target)
185  : pointer_symbol(spointer_symbol), synset_offset(atoi(ssynset_offset.c_str())), pos(spos), source_target(ssource_target){}
186  };
187 
188  bool load(const xtd::string& sFile, size_t & i){
189  size_t iEnd = i;
190  for (; '\n' != sFile[iEnd] && iEnd < sFile.size(); ++iEnd);
191  auto oItems = xtd::string(&sFile[i], &sFile[iEnd]).split({ ' ' }, true);
192  xtd::string w_cnt, p_cnt;
193  size_t x = 0;
194  synset_offset = atoi(oItems[x++].c_str());
195  lex_filenum = atoi(oItems[x++].c_str());
196  ss_type = static_cast<SynsetType>(oItems[x++][0]);
197  w_cnt = oItems[x++];
198  for (auto t = atoi(w_cnt.c_str()); t; --t){
199  auto p1 = oItems[x++];
200  auto p2 = oItems[x++];
201  p1.replace({ '_' }, ' ');
202  p2.replace({ '_' }, ' ');
203  words.emplace_back(p1, p1);
204  }
205  p_cnt = oItems[x++];
206  for (auto t = atoi(p_cnt.c_str()); t; --t){
207  auto p1 = oItems[x++];
208  auto p2 = oItems[x++];
209  auto p3 = static_cast<ptr::SyntacticCategory>(oItems[x++][0]);
210  auto p4 = oItems[x++];
211  pointers.emplace_back(p1, p2, p3, p4);
212  }
213  for (; '|' != sFile[i] && i < iEnd; ++i);
214  gloss = xtd::string(&sFile[i], &sFile[iEnd]);
215  i = ++iEnd;
216  return true;
217  }
218 
219  uint32_t synset_offset, lex_filenum;
220  SynsetType ss_type;
221  xtd::string gloss;
222  word_index::vector words;
223  ptr::vector pointers;
224  };
225 
226 
227  bool load(const xtd::filesystem::path& oPath){
228  return file::load<record>(oPath, records);
229  }
230 
231 
232  record::map records;
233 
234  };
235 
236 
237  struct verb_data_file : data_file{
238 
239  using pointer = std::shared_ptr<verb_data_file>;
240 
241  struct record : data_file::record{
242 
243  struct generic_frame{
244  using vector = std::vector<generic_frame>;
245  xtd::string plus, f_num, w_num;
246  generic_frame(const xtd::string& splus, const xtd::string& sf_num, const xtd::string& sw_num) : plus(splus), f_num(sf_num), w_num(sw_num){}
247  };
248 
249  bool load(const xtd::string& sFile, size_t & i){
250  size_t iEnd = i;
251  for (; '\n' != sFile[iEnd] && iEnd < sFile.size(); ++iEnd);
252  auto oItems = xtd::string(&sFile[i], &sFile[iEnd]).split({ ' ' }, true);
253  xtd::string w_cnt, p_cnt;
254  size_t x = 0;
255  synset_offset = atoi(oItems[x++].c_str());
256  lex_filenum = atoi(oItems[x++].c_str());
257  ss_type = static_cast<SynsetType>(oItems[x++][0]);
258  w_cnt = oItems[x++];
259  for (auto t = atoi(w_cnt.c_str()); t; --t){
260  auto p1 = oItems[x++];
261  auto p2 = oItems[x++];
262  p1.replace({ '_' }, ' ');
263  p2.replace({ '_' }, ' ');
264  words.emplace_back(p1, p2);
265  }
266  p_cnt = oItems[x++];
267  for (auto t = atoi(p_cnt.c_str()); t; --t){
268  auto p1 = oItems[x++];
269  auto p2 = oItems[x++];
270  auto p3 = static_cast<ptr::SyntacticCategory>(oItems[x++][0]);
271  auto p4 = oItems[x++];
272  pointers.emplace_back(p1, p2, p3, p4);
273  }
274  f_cnt = oItems[x++];
275  for (auto t = atoi(f_cnt.c_str()); t; --t){
276  auto p1 = oItems[x++];
277  auto p2 = oItems[x++];
278  auto p3 = oItems[x++];
279  generic_frames.emplace_back(p1, p2, p3);
280  }
281  for (; '|' != sFile[i] && i < iEnd; ++i);
282  gloss = xtd::string(&sFile[i], &sFile[iEnd]);
283  i = ++iEnd;
284  return true;
285  }
286 
287  xtd::string f_cnt;
288  generic_frame::vector generic_frames;
289 
290  };
291 
292  bool load(const xtd::filesystem::path& oPath){
293  return file::load<record>(oPath, records);
294  }
295 
296  record::map records;
297 
298  };
299 
300  data_file::pointer _data_adj;
301  data_file::pointer _data_adv;
302  data_file::pointer _data_noun;
303  verb_data_file::pointer _data_verb;
304  index_file::pointer _index_adj;
305  index_file::pointer _index_adv;
306  index_file::pointer _index_noun;
307  index_file::pointer _index_verb;
308 
309 
310  };
311 }
std::vector< xstring< _ChT > > split(const std::initializer_list< _ChT > &delimiters, bool trimEmpty=false) const
splits the string by the specified delimiters into constituent elements
Definition: string.hpp:165
host, target and build configurations and settings Various components are purpose built for specific ...
handle necessary filesystem and path functionality until C++17 is finalized