//
// Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Official repository: https://github.com/boostorg/json
//
9  

9  

10  
#ifndef BOOST_JSON_DETAIL_UTF8_HPP
10  
#ifndef BOOST_JSON_DETAIL_UTF8_HPP
11  
#define BOOST_JSON_DETAIL_UTF8_HPP
11  
#define BOOST_JSON_DETAIL_UTF8_HPP
12  

12  

13  
#include <boost/endian/conversion.hpp>
13  
#include <boost/endian/conversion.hpp>
14  
#include <boost/json/detail/config.hpp>
14  
#include <boost/json/detail/config.hpp>
15  

15  

16  
#include <cstddef>
16  
#include <cstddef>
17  
#include <cstring>
17  
#include <cstring>
18  
#include <cstdint>
18  
#include <cstdint>
19  

19  

20  
namespace boost {
20  
namespace boost {
21  
namespace json {
21  
namespace json {
22  
namespace detail {
22  
namespace detail {
23  

23  

24  
template<int N>
24  
template<int N>
25  
std::uint32_t
25  
std::uint32_t
26  
load_little_endian(void const* p)
26  
load_little_endian(void const* p)
27  
{
27  
{
28  
    std::uint32_t v = 0;
28  
    std::uint32_t v = 0;
29  
    std::memcpy(&v, p, N);
29  
    std::memcpy(&v, p, N);
30  
    endian::little_to_native_inplace(v);
30  
    endian::little_to_native_inplace(v);
31  
    return v;
31  
    return v;
32  
}
32  
}
33  

33  

34  
/** Classify the first byte of a multi-byte UTF-8 sequence.

    The result packs two values into 16 bits:

    @li the low byte is the total length of the sequence in bytes
        (2, 3 or 4), and

    @li the high byte is a class number (1 to 7) that identifies the
        range the second byte is allowed to be in.

    A result of zero means the byte cannot start a multi-byte sequence.

    @note Only the low 7 bits of `c` are used as the table index, so the
    table describes the bytes 0x80 through 0xFF. A byte below 0x80 is
    not rejected; it yields the entry of the byte with the high bit set.
    Presumably callers only pass bytes that are >= 0x80.

    @param c The first byte of the sequence.

    @return The packed class and length, or zero if invalid.
*/
inline
std::uint16_t
classify_utf8(char c)
{
    // 0x000 = invalid
    // 0x102 = 2 bytes, second byte [80, BF]
    // 0x203 = 3 bytes, second byte [A0, BF]
    // 0x303 = 3 bytes, second byte [80, BF]
    // 0x403 = 3 bytes, second byte [80, 9F]
    // 0x504 = 4 bytes, second byte [90, BF]
    // 0x604 = 4 bytes, second byte [80, BF]
    // 0x704 = 4 bytes, second byte [80, 8F]
    static constexpr std::uint16_t first[128]
    {
       // 0x80 - 0xBF: continuation bytes, never valid as a first byte
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,

       // 0xC0 - 0xDF: C0 and C1 are invalid, the rest start 2 bytes
       0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       // 0xE0 - 0xEF: 3 bytes, E0 and ED restrict the second byte
       0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
       0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
       // 0xF0 - 0xF4: 4 bytes, F0 and F4 restrict the second byte
       // 0xF5 - 0xFF: invalid
       0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    };
    // masking yields an index in [0, 127] even when char is signed
    return first[static_cast<unsigned char>(c & 0x7F)];
}
68  

68  

69  
inline
69  
inline
70  
bool
70  
bool
71  
is_valid_utf8(const char* p, uint16_t first)
71  
is_valid_utf8(const char* p, uint16_t first)
72  
{
72  
{
73  
    uint32_t v;
73  
    uint32_t v;
74  
    switch(first >> 8)
74  
    switch(first >> 8)
75  
    {
75  
    {
76  
    default:
76  
    default:
77  
        return false;
77  
        return false;
78  

78  

79  
    // 2 bytes, second byte [80, BF]
79  
    // 2 bytes, second byte [80, BF]
80  
    case 1:
80  
    case 1:
81  
        v = load_little_endian<2>(p);
81  
        v = load_little_endian<2>(p);
82  
        return (v & 0xC000) == 0x8000;
82  
        return (v & 0xC000) == 0x8000;
83  

83  

84  
    // 3 bytes, second byte [A0, BF]
84  
    // 3 bytes, second byte [A0, BF]
85  
    case 2:
85  
    case 2:
86  
        v = load_little_endian<3>(p);
86  
        v = load_little_endian<3>(p);
87  
        return (v & 0xC0E000) == 0x80A000;
87  
        return (v & 0xC0E000) == 0x80A000;
88  

88  

89  
    // 3 bytes, second byte [80, BF]
89  
    // 3 bytes, second byte [80, BF]
90  
    case 3:
90  
    case 3:
91  
        v = load_little_endian<3>(p);
91  
        v = load_little_endian<3>(p);
92  
        return (v & 0xC0C000) == 0x808000;
92  
        return (v & 0xC0C000) == 0x808000;
93  

93  

94  
    // 3 bytes, second byte [80, 9F]
94  
    // 3 bytes, second byte [80, 9F]
95  
    case 4:
95  
    case 4:
96  
        v = load_little_endian<3>(p);
96  
        v = load_little_endian<3>(p);
97  
        return (v & 0xC0E000) == 0x808000;
97  
        return (v & 0xC0E000) == 0x808000;
98  

98  

99  
    // 4 bytes, second byte [90, BF]
99  
    // 4 bytes, second byte [90, BF]
100  
    case 5:
100  
    case 5:
101  
        v = load_little_endian<4>(p);
101  
        v = load_little_endian<4>(p);
102  
        return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
102  
        return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
103  

103  

104  
    // 4 bytes, second byte [80, BF]
104  
    // 4 bytes, second byte [80, BF]
105  
    case 6:
105  
    case 6:
106  
        v = load_little_endian<4>(p);
106  
        v = load_little_endian<4>(p);
107  
        return (v & 0xC0C0C000) == 0x80808000;
107  
        return (v & 0xC0C0C000) == 0x80808000;
108  

108  

109  
    // 4 bytes, second byte [80, 8F]
109  
    // 4 bytes, second byte [80, 8F]
110  
    case 7:
110  
    case 7:
111  
        v = load_little_endian<4>(p);
111  
        v = load_little_endian<4>(p);
112  
        return (v & 0xC0C0F000) == 0x80808000;
112  
        return (v & 0xC0C0F000) == 0x80808000;
113  
    }
113  
    }
114  
}
114  
}
115  

115  

116  
class utf8_sequence
116  
class utf8_sequence
117  
{
117  
{
118  
    char seq_[4];
118  
    char seq_[4];
119  
    uint16_t first_;
119  
    uint16_t first_;
120  
    uint8_t size_;
120  
    uint8_t size_;
121  

121  

122  
public:
122  
public:
123  
    void
123  
    void
124  
    save(
124  
    save(
125  
        const char* p,
125  
        const char* p,
126  
        std::size_t remain) noexcept
126  
        std::size_t remain) noexcept
127  
    {
127  
    {
128  
        first_ = classify_utf8(*p );
128  
        first_ = classify_utf8(*p );
129  
        if(remain >= length())
129  
        if(remain >= length())
130  
            size_ = length();
130  
            size_ = length();
131  
        else
131  
        else
132  
            size_ = static_cast<uint8_t>(remain);
132  
            size_ = static_cast<uint8_t>(remain);
133  
        std::memcpy(seq_, p, size_);
133  
        std::memcpy(seq_, p, size_);
134  
    }
134  
    }
135  

135  

136  
    uint8_t
136  
    uint8_t
137  
    length() const noexcept
137  
    length() const noexcept
138  
    {
138  
    {
139  
        return first_ & 0xFF;
139  
        return first_ & 0xFF;
140  
    }
140  
    }
141  

141  

142  
    bool
142  
    bool
143  
    complete() const noexcept
143  
    complete() const noexcept
144  
    {
144  
    {
145  
        return size_ >= length();
145  
        return size_ >= length();
146  
    }
146  
    }
147  

147  

148  
    // returns true if complete
148  
    // returns true if complete
149  
    bool
149  
    bool
150  
    append(
150  
    append(
151  
        const char* p,
151  
        const char* p,
152  
        std::size_t remain) noexcept
152  
        std::size_t remain) noexcept
153  
    {
153  
    {
154  
        if(BOOST_JSON_UNLIKELY(needed() == 0))
154  
        if(BOOST_JSON_UNLIKELY(needed() == 0))
155  
            return true;
155  
            return true;
156  
        if(BOOST_JSON_LIKELY(remain >= needed()))
156  
        if(BOOST_JSON_LIKELY(remain >= needed()))
157  
        {
157  
        {
158  
            std::memcpy(
158  
            std::memcpy(
159  
                seq_ + size_, p, needed());
159  
                seq_ + size_, p, needed());
160  
            size_ = length();
160  
            size_ = length();
161  
            return true;
161  
            return true;
162  
        }
162  
        }
163  
        if(BOOST_JSON_LIKELY(remain > 0))
163  
        if(BOOST_JSON_LIKELY(remain > 0))
164  
        {
164  
        {
165  
            std::memcpy(seq_ + size_, p, remain);
165  
            std::memcpy(seq_ + size_, p, remain);
166  
            size_ += static_cast<uint8_t>(remain);
166  
            size_ += static_cast<uint8_t>(remain);
167  
        }
167  
        }
168  
        return false;
168  
        return false;
169  
    }
169  
    }
170  

170  

171  
    const char*
171  
    const char*
172  
    data() const noexcept
172  
    data() const noexcept
173  
    {
173  
    {
174  
        return seq_;
174  
        return seq_;
175  
    }
175  
    }
176  

176  

177  
    uint8_t
177  
    uint8_t
178  
    needed() const noexcept
178  
    needed() const noexcept
179  
    {
179  
    {
180  
        return length() - size_;
180  
        return length() - size_;
181  
    }
181  
    }
182  

182  

183  
    bool
183  
    bool
184  
    valid() const noexcept
184  
    valid() const noexcept
185  
    {
185  
    {
186  
        BOOST_ASSERT(size_ >= length());
186  
        BOOST_ASSERT(size_ >= length());
187  
        return is_valid_utf8(seq_, first_);
187  
        return is_valid_utf8(seq_, first_);
188  
    }
188  
    }
189  
};
189  
};
190  

190  

191  
} // detail
191  
} // detail
192  
} // namespace json
192  
} // namespace json
193  
} // namespace boost
193  
} // namespace boost
194  

194  

195  
#endif
195  
#endif