View Javadoc
1   /*
2    * #%L
3    * StringEncoder.java - mongodb-async-driver - Allanbank Consulting, Inc.
4    * %%
5    * Copyright (C) 2011 - 2014 Allanbank Consulting, Inc.
6    * %%
7    * Licensed under the Apache License, Version 2.0 (the "License");
8    * you may not use this file except in compliance with the License.
9    * You may obtain a copy of the License at
10   * 
11   *      http://www.apache.org/licenses/LICENSE-2.0
12   * 
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   * #L%
19   */
20  
21  package com.allanbank.mongodb.bson.io;
22  
23  import java.io.IOException;
24  import java.io.OutputStream;
25  
26  /**
27   * StringEncoder provides a single location for the string encoding and sizing
28   * logic. This class if backed by a cache of strings to the encoded bytes.
29   * <p>
30   * The cache is controlled via two parameters:
31   * 
32   * @api.no This class is <b>NOT</b> part of the drivers API. This class may be
33   *         mutated in incompatible ways between any two releases of the driver.
34   * @copyright 2013, Allanbank Consulting, Inc., All Rights Reserved
35   */
36  public class StringEncoder {
37  
38      /**
39       * Returns the visitor's output buffer.
40       * 
41       * @param string
42       *            The 'C' string to determine the size of.
43       * @return The visitor's output buffer.
44       */
45      public static int computeCStringSize(final String string) {
46          return utf8Size(string) + 1;
47      }
48  
49      /**
50       * Returns the visitor's output buffer.
51       * 
52       * @param string
53       *            The 'UTF8' string to determine the size of.
54       * @return The visitor's output buffer.
55       */
56      public static int computeStringSize(final String string) {
57          return 4 + utf8Size(string) + 1;
58      }
59  
60      /**
61       * Computes the size of the encoded UTF8 String based on the table below.
62       * 
63       * <pre>
64       * #    Code Points      Bytes
65       * 1    U+0000..U+007F   1
66       * 
67       * 2    U+0080..U+07FF   2
68       * 
69       * 3    U+0800..U+0FFF   3
70       *      U+1000..U+FFFF
71       * 
72       * 4   U+10000..U+3FFFF  4
73       *     U+40000..U+FFFFF  4
74       *    U+100000..U10FFFF  4
75       * </pre>
76       * 
77       * @param string
78       *            The string to determine the length of.
79       * @return The length of the string encoded as UTF8.
80       */
81      public static int utf8Size(final String string) {
82          final int strLength = (string == null) ? 0 : string.length();
83  
84          int length = 0;
85          int codePoint;
86          for (int i = 0; i < strLength; i += Character.charCount(codePoint)) {
87              codePoint = Character.codePointAt(string, i);
88              if (codePoint < 0x80) {
89                  length += 1;
90              }
91              else if (codePoint < 0x800) {
92                  length += 2;
93              }
94              else if (codePoint < 0x10000) {
95                  length += 3;
96              }
97              else {
98                  length += 4;
99              }
100         }
101 
102         return length;
103     }
104 
105     /** A private buffer for encoding strings. */
106     private final byte[] myBuffer = new byte[1024];
107 
108     /** The cache of strings to bytes. */
109     private final StringEncoderCache myCache;
110 
111     /**
112      * Creates a new StringEncoder.
113      */
114     public StringEncoder() {
115         this(new StringEncoderCache());
116     }
117 
118     /**
119      * Creates a new StringEncoder.
120      * 
121      * @param cache
122      *            The cache for the encoder to use.
123      */
124     public StringEncoder(final StringEncoderCache cache) {
125         myCache = cache;
126     }
127 
128     /**
129      * Writes the string as a UTF-8 string. This method handles the
130      * "normal/easy" cases and delegates to the full character set if things get
131      * complicated.
132      * 
133      * @param string
134      *            The string to encode.
135      * @param out
136      *            The stream to write to.
137      * @throws IOException
138      *             On a failure to write the bytes.
139      */
140     public void encode(final String string, final OutputStream out)
141             throws IOException {
142 
143         if (!string.isEmpty()) {
144             final byte[] encoded = myCache.find(string);
145 
146             if (encoded == null) {
147                 // Cache miss - write the bytes straight to the stream.
148                 fastEncode(string, out);
149             }
150             else {
151                 myCache.used(string, encoded, 0, encoded.length);
152                 out.write(encoded);
153             }
154         }
155     }
156 
157     /**
158      * Computes the size of the encoded UTF8 String based on the table below.
159      * This method may use a cached copy of the encoded string to determine the
160      * size.
161      * 
162      * <pre>
163      * #    Code Points      Bytes
164      * 1    U+0000..U+007F   1
165      * 
166      * 2    U+0080..U+07FF   2
167      * 
168      * 3    U+0800..U+0FFF   3
169      *      U+1000..U+FFFF
170      * 
171      * 4   U+10000..U+3FFFF  4
172      *     U+40000..U+FFFFF  4
173      *    U+100000..U10FFFF  4
174      * </pre>
175      * 
176      * @param string
177      *            The string to determine the length of.
178      * @return The length of the string encoded as UTF8.
179      */
180     public int encodeSize(final String string) {
181         if (string.isEmpty()) {
182             return 0;
183         }
184 
185         final byte[] cached = myCache.find(string);
186         if (cached != null) {
187             // Don't count this as a usage. Just bonus speed.
188             return cached.length;
189         }
190         return utf8Size(string);
191     }
192 
193     /**
194      * Returns the cache value.
195      * 
196      * @return The cache value.
197      * @deprecated The cache {@link StringEncoderCache} should be controlled
198      *             directly. This method will be removed after the 2.1.0
199      *             release.
200      */
201     @Deprecated
202     public StringEncoderCache getCache() {
203         return myCache;
204     }
205 
206     /**
207      * Writes the string as a UTF-8 string. This method handles the
208      * "normal/easy" cases and delegates to the full character set if things get
209      * complicated.
210      * 
211      * @param string
212      *            The string to encode.
213      * @param out
214      *            The stream to write to.
215      * @throws IOException
216      *             On a failure to write the bytes.
217      */
218     protected void fastEncode(final String string, final OutputStream out)
219             throws IOException {
220         // 4 = max encoded bytes/code point.
221         final int writeUpTo = myBuffer.length - 4;
222         final int strLength = string.length();
223 
224         boolean bufferHasAllBytes = true;
225 
226         int bufferOffset = 0;
227         int codePoint;
228         for (int i = 0; i < strLength; i += Character.charCount(codePoint)) {
229 
230             // Check for buffer overflow.
231             if (writeUpTo < bufferOffset) {
232                 bufferHasAllBytes = false;
233                 if (out != null) {
234                     out.write(myBuffer, 0, bufferOffset);
235                 }
236                 bufferOffset = 0;
237             }
238 
239             codePoint = Character.codePointAt(string, i);
240             if (codePoint < 0x80) {
241                 myBuffer[bufferOffset++] = (byte) codePoint;
242             }
243             else if (codePoint < 0x800) {
244                 myBuffer[bufferOffset++] = (byte) (0xC0 + ((codePoint >> 6) & 0xFF));
245                 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 0) & 0x3F));
246             }
247             else if (codePoint < 0x10000) {
248                 myBuffer[bufferOffset++] = (byte) (0xE0 + ((codePoint >> 12) & 0xFF));
249                 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 6) & 0x3F));
250                 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 0) & 0x3f));
251             }
252             else {
253                 myBuffer[bufferOffset++] = (byte) (0xF0 + ((codePoint >> 18) & 0xFF));
254                 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 12) & 0x3F));
255                 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 6) & 0x3F));
256                 myBuffer[bufferOffset++] = (byte) (0x80 + ((codePoint >> 0) & 0x3F));
257             }
258         }
259 
260         // Write out the final results.
261         if (out != null) {
262             out.write(myBuffer, 0, bufferOffset);
263         }
264 
265         // ... and try and save it in the cache.
266         if (bufferHasAllBytes) {
267             myCache.used(string, myBuffer, 0, bufferOffset);
268         }
269     }
270 }