My favorites | Sign in
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes  
Changes to /trunk/java/src/test/java/com/google/protobuf/IsValidUtf8TestUtil.java
r0 vs. r425 Compare: vs.  Format:
Revision r425
Go to: 
Project members, sign in to write a code review
/trunk/java/src/test/java/com/google/protobuf/IsValidUtf8TestUtil.java /trunk/java/src/test/java/com/google/protobuf/IsValidUtf8TestUtil.java   r425
  1 // Protocol Buffers - Google's data interchange format
  2 // Copyright 2008 Google Inc. All rights reserved.
  3 // http://code.google.com/p/protobuf/
  4 //
  5 // Redistribution and use in source and binary forms, with or without
  6 // modification, are permitted provided that the following conditions are
  7 // met:
  8 //
  9 // * Redistributions of source code must retain the above copyright
  10 // notice, this list of conditions and the following disclaimer.
  11 // * Redistributions in binary form must reproduce the above
  12 // copyright notice, this list of conditions and the following disclaimer
  13 // in the documentation and/or other materials provided with the
  14 // distribution.
  15 // * Neither the name of Google Inc. nor the names of its
  16 // contributors may be used to endorse or promote products derived from
  17 // this software without specific prior written permission.
  18 //
  19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30
  31 package com.google.protobuf;
  32
  33 import static junit.framework.Assert.*;
  34
  35 import java.io.UnsupportedEncodingException;
  36 import java.util.ArrayList;
  37 import java.util.Arrays;
  38 import java.util.List;
  39 import java.util.Random;
  40 import java.util.logging.Logger;
  41 import java.nio.charset.CharsetDecoder;
  42 import java.nio.charset.Charset;
  43 import java.nio.charset.CodingErrorAction;
  44 import java.nio.charset.CharsetEncoder;
  45 import java.nio.charset.CoderResult;
  46 import java.nio.ByteBuffer;
  47 import java.nio.CharBuffer;
  48
  49 /**
  50 * Shared testing code for {@link IsValidUtf8Test} and
  51 * {@link IsValidUtf8FourByteTest}.
  52 *
  53 * @author jonp@google.com (Jon Perlow)
  54 * @author martinrb@google.com (Martin Buchholz)
  55 */
  56 class IsValidUtf8TestUtil {
  57 private static Logger logger = Logger.getLogger(
  58 IsValidUtf8TestUtil.class.getName());
  59
  60 // 128 - [chars 0x0000 to 0x007f]
  61 static long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;
  62
  63 // 128
  64 static long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
  65 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
  66
  67 // 1920 [chars 0x0080 to 0x07FF]
  68 static long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;
  69
  70 // 18,304
  71 static long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
  72 // Both bytes are one byte characters
  73 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
  74 // The possible number of two byte characters
  75 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
  76
  77 // 2048
  78 static long THREE_BYTE_SURROGATES = 2 * 1024;
  79
  80 // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
  81 static long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
  82 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
  83
  84 // 2,650,112
  85 static long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
  86 // All one byte characters
  87 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
  88 // One two byte character and a one byte character
  89 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
  90 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
  91 // Three byte characters
  92 THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
  93
  94 // 1,048,576 [chars 0x10000L to 0x10FFFF]
  95 static long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;
  96
  97 // 289,571,839
  98 static long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
  99 // All one byte characters
  100 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
  101 // One and three byte characters
  102 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
  103 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
  104 // Two two byte characters
  105 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
  106 // Permutations of one and two byte characters
  107 3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
  108 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
  109 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
  110 // Four byte characters
  111 FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
  112
  113 static class Shard {
  114 final long index;
  115 final long start;
  116 final long lim;
  117 final long expected;
  118
  119
  120 public Shard(long index, long start, long lim, long expected) {
  121 assertTrue(start < lim);
  122 this.index = index;
  123 this.start = start;
  124 this.lim = lim;
  125 this.expected = expected;
  126 }
  127 }
  128
  129 static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES =
  130 generateFourByteShardsExpectedRunnables();
  131
  132 private static long[] generateFourByteShardsExpectedRunnables() {
  133 long[] expected = new long[128];
  134
  135 // 0-63 are all 5300224
  136 for (int i = 0; i <= 63; i++) {
  137 expected[i] = 5300224;
  138 }
  139
  140 // 97-111 are all 2342912
  141 for (int i = 97; i <= 111; i++) {
  142 expected[i] = 2342912;
  143 }
  144
  145 // 113-117 are all 1048576
  146 for (int i = 113; i <= 117; i++) {
  147 expected[i] = 1048576;
  148 }
  149
  150 // One offs
  151 expected[112] = 786432;
  152 expected[118] = 786432;
  153 expected[119] = 1048576;
  154 expected[120] = 458752;
  155 expected[121] = 524288;
  156 expected[122] = 65536;
  157
  158 // Anything not assigned was the default 0.
  159 return expected;
  160 }
  161
  162 static final List<Shard> FOUR_BYTE_SHARDS = generateFourByteShards(
  163 128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES);
  164
  165
  166 private static List<Shard> generateFourByteShards(
  167 int numShards, long[] expected) {
  168 assertEquals(numShards, expected.length);
  169 List<Shard> shards = new ArrayList<Shard>(numShards);
  170 long LIM = 1L << 32;
  171 long increment = LIM / numShards;
  172 assertTrue(LIM % numShards == 0);
  173 for (int i = 0; i < numShards; i++) {
  174 shards.add(new Shard(i,
  175 increment * i,
  176 increment * (i + 1),
  177 expected[i]));
  178 }
  179 return shards;
  180 }
  181
  182 /**
  183 * Helper to run the loop to test all the permutations for the number of bytes
  184 * specified.
  185 *
  186 * @param numBytes the number of bytes in the byte array
  187 * @param expectedCount the expected number of roundtrippable permutations
  188 */
  189 static void testBytes(int numBytes, long expectedCount)
  190 throws UnsupportedEncodingException {
  191 testBytes(numBytes, expectedCount, 0, -1);
  192 }
  193
  194 /**
  195 * Helper to run the loop to test all the permutations for the number of bytes
  196 * specified. This overload is useful for debugging to get the loop to start
  197 * at a certain character.
  198 *
  199 * @param numBytes the number of bytes in the byte array
  200 * @param expectedCount the expected number of roundtrippable permutations
  201 * @param start the starting bytes encoded as a long as big-endian
  202 * @param lim the limit of bytes to process encoded as a long as big-endian,
  203 * or -1 to mean the max limit for numBytes
  204 */
  205 static void testBytes(int numBytes, long expectedCount, long start, long lim)
  206 throws UnsupportedEncodingException {
  207 Random rnd = new Random();
  208 byte[] bytes = new byte[numBytes];
  209
  210 if (lim == -1) {
  211 lim = 1L << (numBytes * 8);
  212 }
  213 long count = 0;
  214 long countRoundTripped = 0;
  215 for (long byteChar = start; byteChar < lim; byteChar++) {
  216 long tmpByteChar = byteChar;
  217 for (int i = 0; i < numBytes; i++) {
  218 bytes[bytes.length - i - 1] = (byte) tmpByteChar;
  219 tmpByteChar = tmpByteChar >> 8;
  220 }
  221 ByteString bs = ByteString.copyFrom(bytes);
  222 boolean isRoundTrippable = bs.isValidUtf8();
  223 String s = new String(bytes, "UTF-8");
  224 byte[] bytesReencoded = s.getBytes("UTF-8");
  225 boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
  226
  227 if (bytesEqual != isRoundTrippable) {
  228 outputFailure(byteChar, bytes, bytesReencoded);
  229 }
  230
  231 // Check agreement with static Utf8 methods.
  232 assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes));
  233 assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes, 0, numBytes));
  234
  235 // Test partial sequences.
  236 // Partition numBytes into three segments (not necessarily non-empty).
  237 int i = rnd.nextInt(numBytes);
  238 int j = rnd.nextInt(numBytes);
  239 if (j < i) {
  240 int tmp = i; i = j; j = tmp;
  241 }
  242 int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE, bytes, 0, i);
  243 int state2 = Utf8.partialIsValidUtf8(state1, bytes, i, j);
  244 int state3 = Utf8.partialIsValidUtf8(state2, bytes, j, numBytes);
  245 if (isRoundTrippable != (state3 == Utf8.COMPLETE)) {
  246 System.out.printf("state=%04x %04x %04x i=%d j=%d%n",
  247 state1, state2, state3, i, j);
  248 outputFailure(byteChar, bytes, bytesReencoded);
  249 }
  250 assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE));
  251
  252 // Test ropes built out of small partial sequences
  253 ByteString rope = RopeByteString.newInstanceForTest(
  254 bs.substring(0, i),
  255 RopeByteString.newInstanceForTest(
  256 bs.substring(i, j),
  257 bs.substring(j, numBytes)));
  258 assertSame(RopeByteString.class, rope.getClass());
  259
  260 ByteString[] byteStrings = { bs, bs.substring(0, numBytes), rope };
  261 for (ByteString x : byteStrings) {
  262 assertEquals(isRoundTrippable,
  263 x.isValidUtf8());
  264 assertEquals(state3,
  265 x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes));
  266
  267 assertEquals(state1,
  268 x.partialIsValidUtf8(Utf8.COMPLETE, 0, i));
  269 assertEquals(state1,
  270 x.substring(0, i).partialIsValidUtf8(Utf8.COMPLETE, 0, i));
  271 assertEquals(state2,
  272 x.partialIsValidUtf8(state1, i, j - i));
  273 assertEquals(state2,
  274 x.substring(i, j).partialIsValidUtf8(state1, 0, j - i));
  275 assertEquals(state3,
  276 x.partialIsValidUtf8(state2, j, numBytes - j));
  277 assertEquals(state3,
  278 x.substring(j, numBytes)
  279 .partialIsValidUtf8(state2, 0, numBytes - j));
  280 }
  281
  282 // ByteString reduplication should not affect its UTF-8 validity.
  283 ByteString ropeADope =
  284 RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes));
  285 assertEquals(isRoundTrippable, ropeADope.isValidUtf8());
  286
  287 if (isRoundTrippable) {
  288 countRoundTripped++;
  289 }
  290 count++;
  291 if (byteChar != 0 && byteChar % 1000000L == 0) {
  292 logger.info("Processed " + (byteChar / 1000000L) +
  293 " million characters");
  294 }
  295 }
  296 logger.info("Round tripped " + countRoundTripped + " of " + count);
  297 assertEquals(expectedCount, countRoundTripped);
  298 }
  299
  300 /**
  301 * Variation of {@link #testBytes} that does less allocation using the
  302 * low-level encoders/decoders directly. Checked in because it's useful for
  303 * debugging when trying to process bytes faster, but since it doesn't use the
  304 * actual String class, it's possible for incompatibilities to develop
  305 * (although unlikely).
  306 *
  307 * @param numBytes the number of bytes in the byte array
  308 * @param expectedCount the expected number of roundtrippable permutations
  309 * @param start the starting bytes encoded as a long as big-endian
  310 * @param lim the limit of bytes to process encoded as a long as big-endian,
  311 * or -1 to mean the max limit for numBytes
  312 */
  313 void testBytesUsingByteBuffers(
  314 int numBytes, long expectedCount, long start, long lim)
  315 throws UnsupportedEncodingException {
  316 CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
  317 .onMalformedInput(CodingErrorAction.REPLACE)
  318 .onUnmappableCharacter(CodingErrorAction.REPLACE);
  319 CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder()
  320 .onMalformedInput(CodingErrorAction.REPLACE)
  321 .onUnmappableCharacter(CodingErrorAction.REPLACE);
  322 byte[] bytes = new byte[numBytes];
  323 int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1;
  324 char[] charsDecoded =
  325 new char[(int) (decoder.maxCharsPerByte() * numBytes) + 1];
  326 int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1;
  327 byte[] bytesReencoded = new byte[maxBytes];
  328
  329 ByteBuffer bb = ByteBuffer.wrap(bytes);
  330 CharBuffer cb = CharBuffer.wrap(charsDecoded);
  331 ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded);
  332 if (lim == -1) {
  333 lim = 1L << (numBytes * 8);
  334 }
  335 long count = 0;
  336 long countRoundTripped = 0;
  337 for (long byteChar = start; byteChar < lim; byteChar++) {
  338 bb.rewind();
  339 bb.limit(bytes.length);
  340 cb.rewind();
  341 cb.limit(charsDecoded.length);
  342 bbReencoded.rewind();
  343 bbReencoded.limit(bytesReencoded.length);
  344 encoder.reset();
  345 decoder.reset();
  346 long tmpByteChar = byteChar;
  347 for (int i = 0; i < bytes.length; i++) {
  348 bytes[bytes.length - i - 1] = (byte) tmpByteChar;
  349 tmpByteChar = tmpByteChar >> 8;
  350 }
  351 boolean isRoundTrippable = ByteString.copyFrom(bytes).isValidUtf8();
  352 CoderResult result = decoder.decode(bb, cb, true);
  353 assertFalse(result.isError());
  354 result = decoder.flush(cb);
  355 assertFalse(result.isError());
  356
  357 int charLen = cb.position();
  358 cb.rewind();
  359 cb.limit(charLen);
  360 result = encoder.encode(cb, bbReencoded, true);
  361 assertFalse(result.isError());
  362 result = encoder.flush(bbReencoded);
  363 assertFalse(result.isError());
  364
  365 boolean bytesEqual = true;
  366 int bytesLen = bbReencoded.position();
  367 if (bytesLen != numBytes) {
  368 bytesEqual = false;
  369 } else {
  370 for (int i = 0; i < numBytes; i++) {
  371 if (bytes[i] != bytesReencoded[i]) {
  372 bytesEqual = false;
  373 break;
  374 }
  375 }
  376 }
  377 if (bytesEqual != isRoundTrippable) {
  378 outputFailure(byteChar, bytes, bytesReencoded, bytesLen);
  379 }
  380
  381 count++;
  382 if (isRoundTrippable) {
  383 countRoundTripped++;
  384 }
  385 if (byteChar != 0 && byteChar % 1000000 == 0) {
  386 logger.info("Processed " + (byteChar / 1000000) +
  387 " million characters");
  388 }
  389 }
  390 logger.info("Round tripped " + countRoundTripped + " of " + count);
  391 assertEquals(expectedCount, countRoundTripped);
  392 }
  393
  394 private static void outputFailure(long byteChar, byte[] bytes, byte[] after) {
  395 outputFailure(byteChar, bytes, after, after.length);
  396 }
  397
  398 private static void outputFailure(long byteChar, byte[] bytes, byte[] after,
  399 int len) {
  400 fail("Failure: (" + Long.toHexString(byteChar) + ") " +
  401 toHexString(bytes) + " => " + toHexString(after, len));
  402 }
  403
  404 private static String toHexString(byte[] b) {
  405 return toHexString(b, b.length);
  406 }
  407
  408 private static String toHexString(byte[] b, int len) {
  409 StringBuilder s = new StringBuilder();
  410 s.append("\"");
  411 for (int i = 0; i < len; i++) {
  412 if (i > 0) {
  413 s.append(" ");
  414 }
  415 s.append(String.format("%02x", b[i] & 0xFF));
  416 }
  417 s.append("\"");
  418 return s.toString();
  419 }
  420
  421 }
Powered by Google Project Hosting