1 /*
2 * #%L
3 * StringEncoder.java - mongodb-async-driver - Allanbank Consulting, Inc.
4 * %%
5 * Copyright (C) 2011 - 2014 Allanbank Consulting, Inc.
6 * %%
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 * #L%
19 */
20
21 package com.allanbank.mongodb.bson.io;
22
23 import java.io.IOException;
24 import java.io.OutputStream;
25
26 /**
27 * StringEncoder provides a single location for the string encoding and sizing
28 * logic. This class if backed by a cache of strings to the encoded bytes.
29 * <p>
30 * The cache is controlled via two parameters:
31 *
32 * @api.no This class is <b>NOT</b> part of the drivers API. This class may be
33 * mutated in incompatible ways between any two releases of the driver.
34 * @copyright 2013, Allanbank Consulting, Inc., All Rights Reserved
35 */
36 public class StringEncoder {
37
38 /**
39 * Returns the visitor's output buffer.
40 *
41 * @param string
42 * The 'C' string to determine the size of.
43 * @return The visitor's output buffer.
44 */
45 public static int computeCStringSize(final String string) {
46 return utf8Size(string) + 1;
47 }
48
49 /**
50 * Returns the visitor's output buffer.
51 *
52 * @param string
53 * The 'UTF8' string to determine the size of.
54 * @return The visitor's output buffer.
55 */
56 public static int computeStringSize(final String string) {
57 return 4 + utf8Size(string) + 1;
58 }
59
60 /**
61 * Computes the size of the encoded UTF8 String based on the table below.
62 *
63 * <pre>
64 * # Code Points Bytes
65 * 1 U+0000..U+007F 1
66 *
67 * 2 U+0080..U+07FF 2
68 *
69 * 3 U+0800..U+0FFF 3
70 * U+1000..U+FFFF
71 *
72 * 4 U+10000..U+3FFFF 4
73 * U+40000..U+FFFFF 4
74 * U+100000..U10FFFF 4
75 * </pre>
76 *
77 * @param string
78 * The string to determine the length of.
79 * @return The length of the string encoded as UTF8.
80 */
81 public static int utf8Size(final String string) {
82 final int strLength = (string == null) ? 0 : string.length();
83
84 int length = 0;
85 int codePoint;
86 for (int i = 0; i < strLength; i += Character.charCount(codePoint)) {
87 codePoint = Character.codePointAt(string, i);
88 if (codePoint < 0x80) {
89 length += 1;
90 }
91 else if (codePoint < 0x800) {
92 length += 2;
93 }
94 else if (codePoint < 0x10000) {
95 length += 3;
96 }
97 else {
98 length += 4;
99 }
100 }
101
102 return length;
103 }
104
105 /** A private buffer for encoding strings. */
106 private final byte[] myBuffer = new byte[1024];
107
108 /** The cache of strings to bytes. */
109 private final StringEncoderCache myCache;
110
111 /**
112 * Creates a new StringEncoder.
113 */
114 public StringEncoder() {
115 this(new StringEncoderCache());
116 }
117
118 /**
119 * Creates a new StringEncoder.
120 *
121 * @param cache
122 * The cache for the encoder to use.
123 */
124 public StringEncoder(final StringEncoderCache cache) {
125 myCache = cache;
126 }
127
128 /**
129 * Writes the string as a UTF-8 string. This method handles the
130 * "normal/easy" cases and delegates to the full character set if things get
131 * complicated.
132 *
133 * @param string
134 * The string to encode.
135 * @param out
136 * The stream to write to.
137 * @throws IOException
138 * On a failure to write the bytes.
139 */
140 public void encode(final String string, final OutputStream out)
141 throws IOException {
142
143 if (!string.isEmpty()) {
144 final byte[] encoded = myCache.find(string);
145
146 if (encoded == null) {
147 // Cache miss - write the bytes straight to the stream.
148 fastEncode(string, out);
149 }
150 else {
151 myCache.used(string, encoded, 0, encoded.length);
152 out.write(encoded);
153 }
154 }
155 }
156
157 /**
158 * Computes the size of the encoded UTF8 String based on the table below.
159 * This method may use a cached copy of the encoded string to determine the
160 * size.
161 *
162 * <pre>
163 * # Code Points Bytes
164 * 1 U+0000..U+007F 1
165 *
166 * 2 U+0080..U+07FF 2
167 *
168 * 3 U+0800..U+0FFF 3
169 * U+1000..U+FFFF
170 *
171 * 4 U+10000..U+3FFFF 4
172 * U+40000..U+FFFFF 4
173 * U+100000..U10FFFF 4
174 * </pre>
175 *
176 * @param string
177 * The string to determine the length of.
178 * @return The length of the string encoded as UTF8.
179 */
180 public int encodeSize(final String string) {
181 if (string.isEmpty()) {
182 return 0;
183 }
184
185 final byte[] cached = myCache.find(string);
186 if (cached != null) {
187 // Don't count this as a usage. Just bonus speed.
188 return cached.length;
189 }
190 return utf8Size(string);
191 }
192
193 /**
194 * Returns the cache value.
195 *
196 * @return The cache value.
197 * @deprecated The cache {@link StringEncoderCache} should be controlled
198 * directly. This method will be removed after the 2.1.0
199 * release.
200 */
201 @Deprecated
202 public StringEncoderCache getCache() {
203 return myCache;
204 }
205
206 /**
207 * Writes the string as a UTF-8 string. This method handles the
208 * "normal/easy" cases and delegates to the full character set if things get
209 * complicated.
210 *
211 * @param string
212 * The string to encode.
213 * @param out
214 * The stream to write to.
215 * @throws IOException
216 * On a failure to write the bytes.
217 */
218 protected void fastEncode(final String string, final OutputStream out)
219 throws IOException {
220 // 4 = max encoded bytes/code point.
221 final int writeUpTo = myBuffer.length - 4;
222 final int strLength = string.length();
223
224 boolean bufferHasAllBytes = true;
225
226 int bufferOffset = 0;
227 int codePoint;
228 for (int i = 0; i < strLength; i += Character.charCount(codePoint)) {
229
230 // Check for buffer overflow.
231 if (writeUpTo < bufferOffset) {
232 bufferHasAllBytes = false;
233 if (out != null) {
234 out.write(myBuffer, 0, bufferOffset);
235 }
236 bufferOffset = 0;
237 }
238
239 codePoint = Character.codePointAt(string, i);
240 if (codePoint < 0x80) {
241 myBuffer[bufferOffset++] = (byte) codePoint;
242 }
243 else if (codePoint < 0x800) {
244 myBuffer[bufferOffset++] = (byte) (0xC0 + ((codePoint >> 6) & 0xFF));
245 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 0) & 0x3F));
246 }
247 else if (codePoint < 0x10000) {
248 myBuffer[bufferOffset++] = (byte) (0xE0 + ((codePoint >> 12) & 0xFF));
249 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 6) & 0x3F));
250 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 0) & 0x3f));
251 }
252 else {
253 myBuffer[bufferOffset++] = (byte) (0xF0 + ((codePoint >> 18) & 0xFF));
254 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 12) & 0x3F));
255 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 6) & 0x3F));
256 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 0) & 0x3F));
257 }
258 }
259
260 // Write out the final results.
261 if (out != null) {
262 out.write(myBuffer, 0, bufferOffset);
263 }
264
265 // ... and try and save it in the cache.
266 if (bufferHasAllBytes) {
267 myCache.used(string, myBuffer, 0, bufferOffset);
268 }
269 }
270 }