Vcsn  2.8
Be Rational
efsm.cc
Go to the documentation of this file.
1 #include <fstream>
2 #include <string>
3 
4 #include <vcsn/config.hh>
5 
6 #include <boost/algorithm/string/erase.hpp>
7 #include <boost/algorithm/string/predicate.hpp> // starts_with
8 #include <boost/algorithm/string/replace.hpp> // replace_all_copy
9 #include <boost/iostreams/filter/bzip2.hpp>
10 #if VCSN_HAVE_BOOST_IOSTREAMS_FILTER_LZMA_HPP
11 # include <boost/iostreams/filter/lzma.hpp>
12 #endif
13 #include <boost/iostreams/filtering_stream.hpp>
14 
15 #include <lib/vcsn/algos/fwd.hh>
17 #include <vcsn/dyn/algos.hh>
18 #include <vcsn/dyn/automaton.hh>
19 #include <vcsn/misc/getargs.hh>
20 #include <vcsn/misc/symbol.hh>
21 #include <vcsn/misc/regex.hh>
22 
23 namespace vcsn
24 {
25  namespace dyn
26  {
27  namespace
28  {
29  template <typename... Args>
30  ATTRIBUTE_NORETURN
31  void
32  raise_invalid(const location& loc, Args&&... args)
33  {
34  const auto& file
35  = loc.begin.filename ? *loc.begin.filename : "file.efsm";
36  raise(file, ": invalid efsm file: ",
37  std::forward<Args>(args)...);
38  }
39 
42  std::string
43  read_here_doc(std::istream& is, const location& loc)
44  {
45  static const auto re
46  = std::regex("cat >\\$medir/([a-z]+)\\.[a-z]* <<\\\\EOFSM",
47  std::regex::extended);
48  std::string line;
49  std::smatch res;
50  while (is.good())
51  {
52  std::getline(is, line, '\n');
53  if (std::regex_match(line, res, re))
54  return res[1];
55  }
56  raise_invalid(loc, "missing \"cat\" symbol");
57  }
58 
62  std::string
63  read_symbol_table(std::istream& is, const location& loc)
64  {
65  std::string res;
66  std::string line;
67  std::string val;
68  while (is.good())
69  {
70  std::getline(is, line, '\n');
71  std::istringstream ss{line};
72  ss >> res;
73  if (ss.fail())
74  continue;
75  ss >> val;
76  if (ss.fail())
77  raise_invalid(loc);
78  if (val == "0" || res == "EOFSM")
79  break;
80  }
81 
82  while (line != "EOFSM" && is.good())
83  std::getline(is, line, '\n');
84 
85  if (line != "EOFSM")
86  raise_invalid(loc, "missing closing EOFSM");
87  return res;
88  }
89 
92  read_weightset_type(std::istream& is, const location& loc)
93  {
94  using weightset_type = lazy_automaton_editor::weightset_type;
95  std::string line;
96  while (is.good())
97  {
98  std::getline(is, line, '\n');
99  if (boost::starts_with(line, "arc_type="))
100  {
101  boost::algorithm::erase_first(line, "arc_type=");
102  static auto map = getarg<weightset_type>
103  {
104  "arc type",
105  {
106  {"log", weightset_type::logarithmic},
107  {"log64", weightset_type::logarithmic},
108  {"standard", weightset_type::tropical},
109  }
110  };
111  return map[line];
112  }
113  }
114  raise_invalid(loc, "missing \"arc_type=\"");
115  }
116  }
117 
118  automaton
119  read_efsm(std::istream& is, const location& loc)
120  {
121  using string_t = symbol;
122 
123  // Whether has both isysmbols and osymbols.
124  bool is_transducer = false;
125 
126  // Look for the arc type, which describes the weightset.
127  auto weightset = read_weightset_type(is, loc);
128 
129  // Look for the symbol table.
130  auto isyms = read_here_doc(is, loc);
131  // The single piece of information we need from the symbol
132  // table: the representation of the empty word.
133  std::string ione = read_symbol_table(is, loc);
134 
135  // If we had "isymbols", we now expect "osymbols".
136  std::string oone = ione;
137  if (isyms == "isymbols")
138  {
139  is_transducer = true;
140  auto osyms = read_here_doc(is, loc);
141  if (osyms != "osymbols")
142  raise_invalid(loc, "expected osymbols: ", osyms);
143  oone = read_symbol_table(is, loc);
144  }
145 
146  auto edit = vcsn::lazy_automaton_editor{};
147  edit.open(true);
148  edit.weightset(weightset);
149 
150  // The first transition also provides the initial state.
151  bool first = true;
152  auto trans = read_here_doc(is, loc);
153  if (trans != "transitions")
154  raise_invalid(loc, "expected transitions: ", trans);
155  // Line: Source Dest ILabel [OLabel] [Weight].
156  // Line: FinalState [Weight].
157  std::string line;
158  while (is.good())
159  {
160  std::getline(is, line, '\n');
161  if (line == "EOFSM")
162  break;
163  std::istringstream ss{line};
164  string_t s, d, l1, l2, w;
165  ss >> s >> d >> l1 >> l2 >> w;
166  if (first)
167  edit.add_initial(s);
168  if (l1.get().empty())
169  // FinalState [Weight]
170  edit.add_final(s, d);
171  else
172  {
173  if (l1 == ione)
174  l1 = "\\e";
175  if (is_transducer)
176  {
177  if (l2 == oone)
178  l2 = "\\e";
179  edit.add_transition(s, d, l1, l2, w);
180  }
181  else
182  {
183  // l2 is actually the weight.
184  edit.add_transition(s, d, l1, l2);
185  }
186  }
187  first = false;
188  }
189 
190  if (line != "EOFSM")
191  raise_invalid(loc, "missing EOFSM");
192  // Flush till EOF.
193  while (is.get() != EOF)
194  continue;
195 
196  // We don't want to read it as a `law<char>` automaton, as for
197  // OpenFST, these "words" are insecable. The proper
198  // interpretation is lal<string> (or lan<string>).
199  using boost::algorithm::replace_all_copy;
200  auto ctx = replace_all_copy(edit.result_context(),
201  "law<char>", "lan<string>");
202  return edit.result(ctx);
203  }
204 
205  automaton
206  read_efsm_bzip2(std::istream& is, const location& loc)
207  {
208  namespace io = boost::iostreams;
209  auto&& in =io::filtering_stream<io::input>{};
210  in.push(io::bzip2_decompressor());
211  in.push(is);
212  return read_efsm(in, loc);
213  }
214 
215  automaton
216  read_efsm_lzma(std::istream& is, const location& loc)
217  {
218 #if VCSN_HAVE_BOOST_IOSTREAMS_FILTER_LZMA_HPP
219  namespace io = boost::iostreams;
220  auto&& in =io::filtering_stream<io::input>{};
221  in.push(io::lzma_decompressor());
222  in.push(is);
223  return read_efsm(in, loc);
224 #else
225  raise("Boost is too old (", BOOST_LIB_VERSION, ") to support lzma");
226 #endif
227  }
228  }
229 }
A dyn automaton.
Definition: automaton.hh:17
automaton read_efsm_bzip2(std::istream &is, const location &loc)
Definition: efsm.cc:206
boost::flyweight< std::string, boost::flyweights::no_tracking, boost::flyweights::intermodule_holder > symbol
An internalized string.
Definition: symbol.hh:21
symbol string_t
Definition: parse.hh:66
Abstract a location.
Definition: location.hh:47
vcsn::rat::location location
Pairs of positions in a file/stream.
Definition: fwd.hh:36
automaton read_efsm(std::istream &is, const location &loc)
Definition: efsm.cc:119
Definition: a-star.hh:8
auto in(const Aut &aut, state_t_of< Aut > s)
Indexes of visible transitions arriving to state s.
Definition: automaton.hh:135
Build an automaton with unknown context.
bool open(bool o)
Whether unknown letters should be added, or rejected.
automaton read_efsm_lzma(std::istream &is, const location &loc)
Definition: efsm.cc:216
weightset_type
Weightset types.
auto map(const std::tuple< Ts... > &ts, Fun f) -> decltype(map_tuple_(f, ts, make_index_sequence< sizeof...(Ts)>()))
Map a function on a tuple, return tuple of the results.
Definition: tuple.hh:223
return res
Definition: multiply.hh:399