ゴミ箱
utf8_checker.hpp
Go to the documentation of this file.
1 //
2 // Copyright (c) 2016-2017 Vinnie Falco (vinnie dot falco at gmail dot com)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 //
7 // Official repository: https://github.com/boostorg/beast
8 //
9 
10 #ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
11 #define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
12 
14 #include <boost/asio/buffer.hpp>
15 #include <boost/assert.hpp>
16 #include <algorithm>
17 #include <cstdint>
18 
19 namespace boost {
20 namespace beast {
21 namespace websocket {
22 namespace detail {
23 
/** Incremental UTF-8 validator.

    Text may be supplied in pieces through `write`. When a piece ends
    in the middle of a code point, the bytes seen so far are kept in a
    small internal buffer and checked together with the bytes supplied
    by the next call. `finish` reports whether the text ended on a code
    point boundary.
*/
template<class = void>
class utf8_checker_t
{
    std::size_t need_ = 0;      // chars we need to finish the code point
    std::uint8_t* p_ = cp_;     // current position in temp buffer
    std::uint8_t cp_[4] = {};   // a temp buffer for the code point

public:
    utf8_checker_t() = default;

    // `p_` points into this object's own `cp_`. A member-wise copy
    // would leave the new object pointing into the source's buffer,
    // so the copy operations duplicate the bytes and rebase `p_`.
    utf8_checker_t(utf8_checker_t const& other) noexcept
        : need_(other.need_)
        , p_(cp_ + (other.p_ - other.cp_))
    {
        std::copy(other.cp_, other.cp_ + sizeof(cp_), cp_);
    }

    utf8_checker_t&
    operator=(utf8_checker_t const& other) noexcept
    {
        if(this != &other)
        {
            need_ = other.need_;
            p_ = cp_ + (other.p_ - other.cp_);
            std::copy(other.cp_, other.cp_ + sizeof(cp_), cp_);
        }
        return *this;
    }

    /** Prepare to process text as valid UTF-8.

        Any partially buffered code point is discarded.
    */
    void
    reset();

    /** Complete the validation.

        @return `true` if no code point was left incomplete. The
        checker is reset as a side effect.
    */
    bool
    finish();

    /** Check a contiguous run of bytes.

        @param in Pointer to the first byte.
        @param size Number of bytes at `in`.
        @return `false` if the bytes seen so far cannot be part of
        valid UTF-8, otherwise `true`.
    */
    bool
    write(std::uint8_t const* in, std::size_t size);

    /** Check every buffer in a sequence, in order.

        @return `false` as soon as one buffer fails validation,
        otherwise `true`.
    */
    template<class ConstBufferSequence>
    bool
    write(ConstBufferSequence const& bs);
};
63 
64 template<class _>
65 void
68 {
69  need_ = 0;
70  p_ = cp_;
71 }
72 
73 template<class _>
74 bool
77 {
78  auto const success = need_ == 0;
79  reset();
80  return success;
81 }
82 
83 template<class _>
84 template<class ConstBufferSequence>
85 bool
88 {
90  "ConstBufferSequence requirements not met");
91  using boost::asio::buffer_cast;
92  using boost::asio::buffer_size;
93  for(boost::asio::const_buffer b : bs)
94  if(! write(buffer_cast<std::uint8_t const*>(b),
95  buffer_size(b)))
96  return false;
97  return true;
98 }
99 
100 template<class _>
101 bool
103 write(std::uint8_t const* in, std::size_t size)
104 {
105  auto const valid =
106  [](std::uint8_t const*& p)
107  {
108  if(p[0] < 128)
109  {
110  ++p;
111  return true;
112  }
113  if((p[0] & 0x60) == 0x40)
114  {
115  if((p[1] & 0xc0) != 0x80)
116  return false;
117  p += 2;
118  return true;
119  }
120  if((p[0] & 0xf0) == 0xe0)
121  {
122  if((p[1] & 0xc0) != 0x80 ||
123  (p[2] & 0xc0) != 0x80 ||
124  (p[0] == 224 && p[1] < 160) ||
125  (p[0] == 237 && p[1] > 159))
126  return false;
127  p += 3;
128  return true;
129  }
130  if((p[0] & 0xf8) == 0xf0)
131  {
132  if(p[0] > 244 ||
133  (p[1] & 0xc0) != 0x80 ||
134  (p[2] & 0xc0) != 0x80 ||
135  (p[3] & 0xc0) != 0x80 ||
136  (p[0] == 240 && p[1] < 144) ||
137  (p[0] == 244 && p[1] > 143))
138  return false;
139  p += 4;
140  return true;
141  }
142  return false;
143  };
144  auto const valid_have =
145  [&]()
146  {
147  if((cp_[0] & 0x60) == 0x40)
148  return cp_[0] <= 223;
149  if((cp_[0] & 0xf0) == 0xe0)
150  {
151  if(p_ - cp_ > 1 &&
152  ((cp_[1] & 0xc0) != 0x80 ||
153  (cp_[0] == 224 && cp_[1] < 160) ||
154  (cp_[0] == 237 && cp_[1] > 159)))
155  return false;
156  return true;
157  }
158  if((cp_[0] & 0xf8) == 0xf0)
159  {
160  auto const n = p_ - cp_;
161  if(n > 2 && (cp_[2] & 0xc0) != 0x80)
162  return false;
163  if(n > 1 &&
164  ((cp_[1] & 0xc0) != 0x80 ||
165  (cp_[0] == 240 && cp_[1] < 144) ||
166  (cp_[0] == 244 && cp_[1] > 143)))
167  return false;
168  }
169  return true;
170  };
171  auto const needed =
172  [](std::uint8_t const v)
173  {
174  if(v < 128)
175  return 1;
176  if(v < 192)
177  return 0;
178  if(v < 224)
179  return 2;
180  if(v < 240)
181  return 3;
182  if(v < 248)
183  return 4;
184  return 0;
185  };
186 
187  auto const end = in + size;
188 
189  // Finish up any incomplete code point
190  if(need_ > 0)
191  {
192  // Calculate what we have
193  auto n = (std::min)(size, need_);
194  size -= n;
195  need_ -= n;
196 
197  // Add characters to the code point
198  while(n--)
199  *p_++ = *in++;
200  BOOST_ASSERT(p_ <= cp_ + 5);
201 
202  // Still incomplete?
203  if(need_ > 0)
204  {
205  // Incomplete code point
206  BOOST_ASSERT(in == end);
207 
208  // Do partial validation on the incomplete
209  // code point, this is called "Fail fast"
210  // in Autobahn|Testsuite parlance.
211  return valid_have();
212  }
213 
214  // Complete code point, validate it
215  std::uint8_t const* p = &cp_[0];
216  if(! valid(p))
217  return false;
218  p_ = cp_;
219  }
220 
221  if(size <= sizeof(std::size_t))
222  goto slow;
223 
224  // Align `in` to sizeof(std::size_t) boundary
225  {
226  auto const in0 = in;
227  auto last = reinterpret_cast<std::uint8_t const*>(
228  ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
229  sizeof(std::size_t)) * sizeof(std::size_t));
230 
231  // Check one character at a time for low-ASCII
232  while(in < last)
233  {
234  if(*in & 0x80)
235  {
236  // Not low-ASCII so switch to slow loop
237  size = size - (in - in0);
238  goto slow;
239  }
240  ++in;
241  }
242  size = size - (in - in0);
243  }
244 
245  // Fast loop: Process 4 or 8 low-ASCII characters at a time
246  {
247  auto const in0 = in;
248  auto last = in + size - 7;
249  auto constexpr mask = static_cast<
250  std::size_t>(0x8080808080808080 & ~std::size_t{0});
251  while(in < last)
252  {
253 #if 0
254  std::size_t temp;
255  std::memcpy(&temp, in, sizeof(temp));
256  if((temp & mask) != 0)
257 #else
258  // Technically UB but works on all known platforms
259  if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0)
260 #endif
261  {
262  size = size - (in - in0);
263  goto slow;
264  }
265  in += sizeof(std::size_t);
266  }
267  // There's at least one more full code point left
268  last += 4;
269  while(in < last)
270  if(! valid(in))
271  return false;
272  goto tail;
273  }
274 
275 slow:
276  // Slow loop: Full validation on one code point at a time
277  {
278  auto last = in + size - 3;
279  while(in < last)
280  if(! valid(in))
281  return false;
282  }
283 
284 tail:
285  // Handle the remaining bytes. The last
286  // characters could split a code point so
287  // we save the partial code point for later.
288  //
289  // On entry to the loop, `in` points to the
290  // beginning of a code point.
291  //
292  for(;;)
293  {
294  // Number of chars left
295  auto n = end - in;
296  if(! n)
297  break;
298 
299  // Chars we need to finish this code point
300  auto const need = needed(*in);
301  if(need == 0)
302  return false;
303  if(need <= n)
304  {
305  // Check a whole code point
306  if(! valid(in))
307  return false;
308  }
309  else
310  {
311  // Calculate how many chars we need
312  // to finish this partial code point
313  need_ = need - n;
314 
315  // Save the partial code point
316  while(n--)
317  *p_++ = *in++;
318  BOOST_ASSERT(in == end);
319  BOOST_ASSERT(p_ <= cp_ + 5);
320 
321  // Do partial validation on the incomplete
322  // code point, this is called "Fail fast"
323  // in Autobahn|Testsuite parlance.
324  return valid_have();
325  }
326  }
327  return true;
328 }
329 
331 
332 template<class = void>
333 bool
334 check_utf8(char const* p, std::size_t n)
335 {
336  utf8_checker c;
337  if(! c.write(reinterpret_cast<const uint8_t*>(p), n))
338  return false;
339  return c.finish();
340 }
341 
342 } // detail
343 } // websocket
344 } // beast
345 } // boost
346 
347 #endif
BufferSequence< boost::asio::const_buffer > ConstBufferSequence
Definition: type_traits.hpp:280
Definition: async_result.hpp:20
void reset()
Definition: utf8_checker.hpp:67
bool finish()
Definition: utf8_checker.hpp:76
STL namespace.
bool write(std::uint8_t const *in, std::size_t size)
Definition: utf8_checker.hpp:103
Definition: type_traits.hpp:59
bool check_utf8(char const *p, std::size_t n)
Definition: utf8_checker.hpp:334