libstdc++
codecvt_specializations.h
Go to the documentation of this file.
1 // Locale support (codecvt) -*- C++ -*-
2 
3 // Copyright (C) 2000-2024 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 //
26 // ISO C++ 14882: 22.2.1.5 Template class codecvt
27 //
28 
29 // Written by Benjamin Kosnik <bkoz@redhat.com>
30 
31 /** @file ext/codecvt_specializations.h
32  * This file is a GNU extension to the Standard C++ Library.
33  */
34 
35 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
36 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
37 
38 #include <bits/requires_hosted.h> // GNU extensions are currently omitted
39 
40 #include <bits/c++config.h>
41 #include <locale>
42 #include <iconv.h>
43 
44 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
45 {
46 _GLIBCXX_BEGIN_NAMESPACE_VERSION
47 _GLIBCXX_BEGIN_NAMESPACE_CXX11
48 
/// Extension to use iconv for dealing with character encodings.
// This includes conversions and comparisons between various character
// sets.  This object encapsulates data that may need to be shared between
// char_traits, codecvt and ctype.
//
// The object owns two iconv conversion descriptors (external->internal
// and internal->external) and closes them in its destructor.
class encoding_state
{
public:
  // Types:
  // NB: A conversion descriptor subsumes and enhances the
  // functionality of a simple state type such as mbstate_t.
  typedef iconv_t descriptor_type;

protected:
  // Name of internal character set encoding.
  std::string _M_int_enc;

  // Name of external character set encoding.
  std::string _M_ext_enc;

  // Conversion descriptor from the external encoding to the internal one.
  descriptor_type _M_in_desc;

  // Conversion descriptor from the internal encoding to the external one.
  descriptor_type _M_out_desc;

  // The byte-order marker for the external encoding, if necessary.
  int _M_ext_bom;

  // The byte-order marker for the internal encoding, if necessary.
  int _M_int_bom;

  // Number of external bytes needed to construct one complete
  // character in the internal encoding.
  // NB: -1 indicates variable, or stateful, encodings.
  int _M_bytes;

public:
  // Creates a state with no encodings and no descriptors; good() is false.
  explicit
  encoding_state()
  : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
  { }

  // Creates a state for converting between the internal encoding __int
  // and the external encoding __ext, opening both iconv descriptors.
  // Throws std::runtime_error (via init()) if a descriptor cannot be
  // created.
  explicit
  encoding_state(const char* __int, const char* __ext,
                 int __ibom = 0, int __ebom = 0, int __bytes = 1)
  : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
    _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
  { init(); }

  // 21.1.2 traits typedefs
  // p4
  // typedef STATE_T state_type
  // requires: state_type shall meet the requirements of
  // CopyConstructible types (20.1.3)
  // NB: This does not preserve the actual state of the conversion
  // descriptor member, but it does duplicate the encoding
  // information.
  encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
  { construct(__obj); }

  // Need assignment operator as well.  Like the copy constructor, this
  // copies the encoding information and opens fresh descriptors.
  encoding_state&
  operator=(const encoding_state& __obj)
  {
    construct(__obj);
    return *this;
  }

  ~encoding_state()
  { destroy(); }

  // True only when both descriptors are open (neither null nor the
  // iconv error value).
  bool
  good() const throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    bool __test = _M_in_desc && _M_in_desc != __err;
    __test &= _M_out_desc && _M_out_desc != __err;
    return __test;
  }

  int
  character_ratio() const
  { return _M_bytes; }

  const std::string
  internal_encoding() const
  { return _M_int_enc; }

  int
  internal_bom() const
  { return _M_int_bom; }

  const std::string
  external_encoding() const
  { return _M_ext_enc; }

  int
  external_bom() const
  { return _M_ext_bom; }

  const descriptor_type&
  in_descriptor() const
  { return _M_in_desc; }

  const descriptor_type&
  out_descriptor() const
  { return _M_out_desc; }

protected:
  // Opens whichever descriptor is still null, provided both encoding
  // names are non-empty.  On failure no descriptor is left open and none
  // holds the iconv error value, then std::runtime_error is thrown.
  void
  init()
  {
    const descriptor_type __err = (iconv_t)(-1);
    const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
    if (!_M_in_desc && __have_encodings)
      {
        _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
        if (_M_in_desc == __err)
          {
            _M_in_desc = 0;
            std::__throw_runtime_error(__N("encoding_state::_M_init "
                                "creating iconv input descriptor failed"));
          }
      }
    if (!_M_out_desc && __have_encodings)
      {
        _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
        if (_M_out_desc == __err)
          {
            _M_out_desc = 0;
            // When init() runs from a constructor the destructor will
            // not be called after the throw, so release the input
            // descriptor here instead of leaking it.
            destroy();
            std::__throw_runtime_error(__N("encoding_state::_M_init "
                                "creating iconv output descriptor failed"));
          }
      }
  }

  // Replaces this object's encoding information with a copy of __obj's
  // and reopens the descriptors.
  void
  construct(const encoding_state& __obj)
  {
    destroy();
    _M_int_enc = __obj._M_int_enc;
    _M_ext_enc = __obj._M_ext_enc;
    _M_ext_bom = __obj._M_ext_bom;
    _M_int_bom = __obj._M_int_bom;
    _M_bytes = __obj._M_bytes;
    init();
  }

  // Closes any open descriptor and resets it to null.
  void
  destroy() throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    if (_M_in_desc && _M_in_desc != __err)
      {
        iconv_close(_M_in_desc);
        _M_in_desc = 0;
      }
    if (_M_out_desc && _M_out_desc != __err)
      {
        iconv_close(_M_out_desc);
        _M_out_desc = 0;
      }
  }
};
207 
208  /// encoding_char_traits
209  // Custom traits type with encoding_state for the state type, and the
210  // associated fpos<encoding_state> for the position type, all other
211  // bits equivalent to the required char_traits instantiations.
212  template<typename _CharT>
214  : public std::char_traits<_CharT>
215  {
216  typedef encoding_state state_type;
217  typedef typename std::fpos<state_type> pos_type;
218  };
219 
220 _GLIBCXX_END_NAMESPACE_CXX11
221 _GLIBCXX_END_NAMESPACE_VERSION
222 } // namespace
223 
224 
225 namespace std _GLIBCXX_VISIBILITY(default)
226 {
227 _GLIBCXX_BEGIN_NAMESPACE_VERSION
228 
230 
231  /// codecvt<InternT, _ExternT, encoding_state> specialization.
232  // This partial specialization takes advantage of iconv to provide
233  // code conversions between a large number of character encodings.
234  template<typename _InternT, typename _ExternT>
235  class codecvt<_InternT, _ExternT, encoding_state>
236  : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
237  {
238  public:
239  // Types:
240  typedef codecvt_base::result result;
241  typedef _InternT intern_type;
242  typedef _ExternT extern_type;
244  typedef state_type::descriptor_type descriptor_type;
245 
246  // Data Members:
247  static locale::id id;
248 
249  explicit
250  codecvt(size_t __refs = 0)
252  { }
253 
254  explicit
255  codecvt(state_type& __enc, size_t __refs = 0)
257  { }
258 
259  protected:
260  virtual
261  ~codecvt() { }
262 
263  virtual result
264  do_out(state_type& __state, const intern_type* __from,
265  const intern_type* __from_end, const intern_type*& __from_next,
266  extern_type* __to, extern_type* __to_end,
267  extern_type*& __to_next) const;
268 
269  virtual result
270  do_unshift(state_type& __state, extern_type* __to,
271  extern_type* __to_end, extern_type*& __to_next) const;
272 
273  virtual result
274  do_in(state_type& __state, const extern_type* __from,
275  const extern_type* __from_end, const extern_type*& __from_next,
276  intern_type* __to, intern_type* __to_end,
277  intern_type*& __to_next) const;
278 
279  virtual int
280  do_encoding() const throw();
281 
282  virtual bool
283  do_always_noconv() const throw();
284 
285  virtual int
286  do_length(state_type&, const extern_type* __from,
287  const extern_type* __end, size_t __max) const;
288 
289  virtual int
290  do_max_length() const throw();
291  };
292 
293  template<typename _InternT, typename _ExternT>
294  locale::id
296 
// This adaptor works around the signature problems of the second
// argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
// uses 'char**', which matches the POSIX 1003.1-2001 standard.
// Using this adaptor, g++ will do the work for us: _Tp is deduced from
// the function pointer passed as __func, and __inbuf is converted to it.
// const_cast is used rather than a C-style cast so that the only
// conversion that can ever happen here is a change of cv-qualification.
// Returns whatever __func returns.
template<typename _Tp>
  inline size_t
  __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
                  iconv_t __cd, char** __inbuf, size_t* __inbytes,
                  char** __outbuf, size_t* __outbytes)
  {
    return __func(__cd, const_cast<_Tp>(__inbuf), __inbytes,
                  __outbuf, __outbytes);
  }
307 
308  template<typename _InternT, typename _ExternT>
309  codecvt_base::result
311  do_out(state_type& __state, const intern_type* __from,
312  const intern_type* __from_end, const intern_type*& __from_next,
313  extern_type* __to, extern_type* __to_end,
314  extern_type*& __to_next) const
315  {
316  result __ret = codecvt_base::error;
317  if (__state.good())
318  {
319  const descriptor_type& __desc = __state.out_descriptor();
320  const size_t __fmultiple = sizeof(intern_type);
321  size_t __fbytes = __fmultiple * (__from_end - __from);
322  const size_t __tmultiple = sizeof(extern_type);
323  size_t __tbytes = __tmultiple * (__to_end - __to);
324 
325  // Argument list for iconv specifies a byte sequence. Thus,
326  // all to/from arrays must be brutally casted to char*.
327  char* __cto = reinterpret_cast<char*>(__to);
328  char* __cfrom;
329  size_t __conv;
330 
331  // Some encodings need a byte order marker as the first item
332  // in the byte stream, to designate endian-ness. The default
333  // value for the byte order marker is NULL, so if this is
334  // the case, it's not necessary and we can just go on our
335  // merry way.
336  int __int_bom = __state.internal_bom();
337  if (__int_bom)
338  {
339  size_t __size = __from_end - __from;
340  intern_type* __cfixed = static_cast<intern_type*>
341  (__builtin_alloca(sizeof(intern_type) * (__size + 1)));
342  __cfixed[0] = static_cast<intern_type>(__int_bom);
343  char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
344  __cfrom = reinterpret_cast<char*>(__cfixed);
345  __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
346  &__fbytes, &__cto, &__tbytes);
347  }
348  else
349  {
350  intern_type* __cfixed = const_cast<intern_type*>(__from);
351  __cfrom = reinterpret_cast<char*>(__cfixed);
352  __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
353  &__cto, &__tbytes);
354  }
355 
356  if (__conv != size_t(-1))
357  {
358  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
359  __to_next = reinterpret_cast<extern_type*>(__cto);
360  __ret = codecvt_base::ok;
361  }
362  else
363  {
364  if (__fbytes < __fmultiple * (__from_end - __from))
365  {
366  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
367  __to_next = reinterpret_cast<extern_type*>(__cto);
368  __ret = codecvt_base::partial;
369  }
370  else
371  __ret = codecvt_base::error;
372  }
373  }
374  return __ret;
375  }
376 
377  template<typename _InternT, typename _ExternT>
378  codecvt_base::result
380  do_unshift(state_type& __state, extern_type* __to,
381  extern_type* __to_end, extern_type*& __to_next) const
382  {
383  result __ret = codecvt_base::error;
384  if (__state.good())
385  {
386  const descriptor_type& __desc = __state.in_descriptor();
387  const size_t __tmultiple = sizeof(intern_type);
388  size_t __tlen = __tmultiple * (__to_end - __to);
389 
390  // Argument list for iconv specifies a byte sequence. Thus,
391  // all to/from arrays must be brutally casted to char*.
392  char* __cto = reinterpret_cast<char*>(__to);
393  size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
394  &__cto, &__tlen);
395 
396  if (__conv != size_t(-1))
397  {
398  __to_next = reinterpret_cast<extern_type*>(__cto);
399  if (__tlen == __tmultiple * (__to_end - __to))
400  __ret = codecvt_base::noconv;
401  else if (__tlen == 0)
402  __ret = codecvt_base::ok;
403  else
404  __ret = codecvt_base::partial;
405  }
406  else
407  __ret = codecvt_base::error;
408  }
409  return __ret;
410  }
411 
412  template<typename _InternT, typename _ExternT>
413  codecvt_base::result
414  codecvt<_InternT, _ExternT, encoding_state>::
415  do_in(state_type& __state, const extern_type* __from,
416  const extern_type* __from_end, const extern_type*& __from_next,
417  intern_type* __to, intern_type* __to_end,
418  intern_type*& __to_next) const
419  {
420  result __ret = codecvt_base::error;
421  if (__state.good())
422  {
423  const descriptor_type& __desc = __state.in_descriptor();
424  const size_t __fmultiple = sizeof(extern_type);
425  size_t __flen = __fmultiple * (__from_end - __from);
426  const size_t __tmultiple = sizeof(intern_type);
427  size_t __tlen = __tmultiple * (__to_end - __to);
428 
429  // Argument list for iconv specifies a byte sequence. Thus,
430  // all to/from arrays must be brutally casted to char*.
431  char* __cto = reinterpret_cast<char*>(__to);
432  char* __cfrom;
433  size_t __conv;
434 
435  // Some encodings need a byte order marker as the first item
436  // in the byte stream, to designate endian-ness. The default
437  // value for the byte order marker is NULL, so if this is
438  // the case, it's not necessary and we can just go on our
439  // merry way.
440  int __ext_bom = __state.external_bom();
441  if (__ext_bom)
442  {
443  size_t __size = __from_end - __from;
444  extern_type* __cfixed = static_cast<extern_type*>
445  (__builtin_alloca(sizeof(extern_type) * (__size + 1)));
446  __cfixed[0] = static_cast<extern_type>(__ext_bom);
447  char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
448  __cfrom = reinterpret_cast<char*>(__cfixed);
449  __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
450  &__flen, &__cto, &__tlen);
451  }
452  else
453  {
454  extern_type* __cfixed = const_cast<extern_type*>(__from);
455  __cfrom = reinterpret_cast<char*>(__cfixed);
456  __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
457  &__flen, &__cto, &__tlen);
458  }
459 
460 
461  if (__conv != size_t(-1))
462  {
463  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
464  __to_next = reinterpret_cast<intern_type*>(__cto);
465  __ret = codecvt_base::ok;
466  }
467  else
468  {
469  if (__flen < static_cast<size_t>(__from_end - __from))
470  {
471  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
472  __to_next = reinterpret_cast<intern_type*>(__cto);
473  __ret = codecvt_base::partial;
474  }
475  else
476  __ret = codecvt_base::error;
477  }
478  }
479  return __ret;
480  }
481 
482  template<typename _InternT, typename _ExternT>
483  int
484  codecvt<_InternT, _ExternT, encoding_state>::
485  do_encoding() const throw()
486  {
487  int __ret = 0;
488  if (sizeof(_ExternT) <= sizeof(_InternT))
489  __ret = sizeof(_InternT) / sizeof(_ExternT);
490  return __ret;
491  }
492 
493  template<typename _InternT, typename _ExternT>
494  bool
495  codecvt<_InternT, _ExternT, encoding_state>::
496  do_always_noconv() const throw()
497  { return false; }
498 
499  template<typename _InternT, typename _ExternT>
500  int
501  codecvt<_InternT, _ExternT, encoding_state>::
502  do_length(state_type&, const extern_type* __from,
503  const extern_type* __end, size_t __max) const
504  { return std::min(__max, static_cast<size_t>(__end - __from)); }
505 
506  // _GLIBCXX_RESOLVE_LIB_DEFECTS
507  // 74. Garbled text for codecvt::do_max_length
508  template<typename _InternT, typename _ExternT>
509  int
510  codecvt<_InternT, _ExternT, encoding_state>::
511  do_max_length() const throw()
512  { return 1; }
513 
514 _GLIBCXX_END_NAMESPACE_VERSION
515 } // namespace
516 
517 #endif
ISO C++ entities toplevel namespace is std.
virtual result do_out(state_type &__state, const intern_type *__from, const intern_type *__from_end, const intern_type *&__from_next, extern_type *__to, extern_type *__to_end, extern_type *&__to_next) const
Convert from internal to external character set.
Class representing stream positions.
Definition: postypes.h:82
const _CharT * c_str() const noexcept
Return const pointer to null-terminated contents.
Definition: cow_string.h:2249
constexpr const _Tp & min(const _Tp &, const _Tp &)
This does what you think it does.
Definition: stl_algobase.h:233
GNU extensions for public use.
Primary class template codecvt. NB: Generic, mostly useless implementation.
Definition: codecvt.h:277
Basis for explicit traits specializations.
Definition: char_traits.h:323
Extension to use iconv for dealing with character encodings.
size_type size() const noexcept
Returns the number of characters in the string, not including any null-termination.
Definition: cow_string.h:908
Common base for codecvt functions.
Definition: codecvt.h:71
Facet ID class. The ID class provides facets with an index used to identify them. Every facet class must define a public static member locale::id.