toUtf.cc
Go to the documentation of this file.
1/*
2 * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9#include "squid.h"
10#include "auth/toUtf.h"
11#include "sbuf/SBuf.h"
12
13#include <limits>
14
15SBuf
16Latin1ToUtf8(const char *in)
17{
18 SBuf result;
19
20 if (!in)
21 return result;
22
23 for (; *in; in++) {
24 const auto ch = static_cast<unsigned char>(*in);
25
26 if (ch < 0x80) {
27 result.append(ch);
28 } else {
29 result.append(static_cast<char>((ch >> 6) | 0xc0));
30 result.append(static_cast<char>((ch & 0x3f) | 0x80));
31 }
32 }
33 return result;
34}
35
36SBuf
37Cp1251ToUtf8(const char *in)
38{
39 static const unsigned char firstByteMark[] = { 0x00, 0x00, 0xC0, 0xE0 };
40 static const unsigned unicodevalues[] = {
41 0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
42 0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
43 0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
44 0xFFFD, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
45 0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
46 0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
47 0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
48 0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457
49 };
50 SBuf result;
51
52 if (!in)
53 return result;
54
55 for (; *in; in++) {
56 const auto ch = static_cast<unsigned char>(*in);
57 unsigned u = 0;
58 size_t bytesToWrite = 0;
59 char sequence[4] = {0, 0, 0, 0};
60
61 static_assert(std::numeric_limits<unsigned char>::max() == 0xFFu,
62 "we require char to be exactly 8 bits");
63 if (ch < 0x80)
64 u = ch;
65 else if (ch >= 0xC0) // 0x0410..0x044F
66 u = 0x0350 + ch;
67 else
68 u = unicodevalues[ch - 0x80];
69
70 if (u < 0x80)
71 bytesToWrite = 1;
72 else if (u < 0x800)
73 bytesToWrite = 2;
74 else
75 bytesToWrite = 3;
76
77 switch (bytesToWrite) {
78 case 3:
79 sequence[2] = static_cast<char>(u & 0x3f) | 0x80;
80 u >>= 6;
81 [[fallthrough]];
82 case 2:
83 sequence[1] = static_cast<char>(u & 0x3f) | 0x80;
84 u >>= 6;
85 [[fallthrough]];
86 case 1:
87 sequence[0] = static_cast<char>(u) | firstByteMark[bytesToWrite];
88 }
89 result.append(sequence, bytesToWrite);
90 }
91 return result;
92}
93
/// Determines the total byte length of a UTF-8 sequence from its first byte.
/// Only the bit pattern of the first byte is examined; the bytes that follow
/// are not inspected here.
/// \param b0  the first byte of the sequence
/// \returns 1, 2, 3, or 4 for a byte that can start a sequence; 0 for a byte
/// that cannot (a 10xxxxxx continuation byte or a 11111xxx byte)
static inline size_t
utf8CodePointLength(const char b0)
{
    if ((b0 & 0x80) == 0)
        return 1; // 0xxxxxxx
    if ((b0 & 0xC0) != 0xC0)
        return 0; // invalid code point: 10xxxxxx cannot start a sequence
    if ((b0 & 0xE0) == 0xC0)
        return 2; // 110xxxxx
    if ((b0 & 0xF0) == 0xE0)
        return 3; // 1110xxxx
    if ((b0 & 0xF8) == 0xF0)
        return 4; // 11110xxx
    return 0; // invalid code point: 11111xxx
}
115
/// Checks whether the `length` bytes starting at `source` form one
/// well-formed UTF-8 sequence.
///
/// This function does not verify that `length` agrees with the bit pattern of
/// the first byte; the caller is expected to supply a matching length.
///
/// \param source  the first byte of the candidate sequence; `length` bytes are read
/// \param length  the number of bytes in the candidate sequence
/// \returns false if length is not in 1..4 or the bytes are not well-formed
static bool
isValidUtf8CodePoint(const unsigned char* source, const size_t length)
{
    if (length < 1 || length > 4)
        return false;

    const auto isContinuation = [](const unsigned char octet) {
        return octet >= 0x80 && octet <= 0xBF;
    };

    // Every byte after the first must be a 10xxxxxx continuation byte. This
    // covers the second byte for ALL first-byte values: checking only its
    // upper bound would accept sequences such as ED 41 80 or F4 20 80 80.
    for (size_t pos = 1; pos < length; ++pos) {
        if (!isContinuation(source[pos]))
            return false;
    }

    const unsigned char first = source[0];

    if (length >= 2) {
        const unsigned char second = source[1];
        // Some first bytes narrow the permitted range of the second byte.
        switch (first) {
        case 0xE0: // a lower second byte would be an overlong 3-byte form
            if (second < 0xA0) return false;
            break;
        case 0xED: // a higher second byte would encode U+D800..U+DFFF
            if (second > 0x9F) return false;
            break;
        case 0xF0: // a lower second byte would be an overlong 4-byte form
            if (second < 0x90) return false;
            break;
        case 0xF4: // a higher second byte would exceed U+10FFFF
            if (second > 0x8F) return false;
            break;
        default:
            break;
        }
    }

    // 0x80..0xBF are continuation bytes; 0xC0 and 0xC1 would start overlong forms.
    if (first >= 0x80 && first < 0xC2)
        return false;

    // First bytes above 0xF4 would encode values beyond U+10FFFF.
    return first <= 0xF4;
}
167
171bool
172isValidUtf8String(const char *source, const char *sourceEnd) {
173 while (source < sourceEnd) {
174 const auto length = utf8CodePointLength(*source);
175 if (source + length > sourceEnd || !isValidUtf8CodePoint(reinterpret_cast<const unsigned char*>(source), length))
176 return false;
177 source += length;
178 }
179 return true; // including zero-length input
180}
181
Definition: SBuf.h:94
SBuf & append(const SBuf &S)
Definition: SBuf.cc:185
A const & max(A const &lhs, A const &rhs)
static size_t utf8CodePointLength(const char b0)
Definition: toUtf.cc:101
SBuf Cp1251ToUtf8(const char *in)
converts CP1251 to UTF-8
Definition: toUtf.cc:37
SBuf Latin1ToUtf8(const char *in)
converts ISO-LATIN-1 to UTF-8
Definition: toUtf.cc:16
static bool isValidUtf8CodePoint(const unsigned char *source, const size_t length)
Definition: toUtf.cc:123
bool isValidUtf8String(const char *source, const char *sourceEnd)
returns whether the given input is a valid (or empty) sequence of UTF-8 code points
Definition: toUtf.cc:172

 

Introduction

Documentation

Support

Miscellaneous

Web Site Translations

Mirrors