From e60096c377d8a3cb5bed3992352779195be95bb4 Mon Sep 17 00:00:00 2001 From: belugabehr <12578579+belugabehr@users.noreply.github.com> Date: Fri, 24 Jul 2020 05:37:28 -0400 Subject: [PATCH] HADOOP-17141. Add Capability To Get Text Length (#2157) Contributed by David Mollitor --- .../main/java/org/apache/hadoop/io/Text.java | 18 ++++++++++++++++++ .../java/org/apache/hadoop/io/TestText.java | 10 ++++++++++ 2 files changed, 28 insertions(+) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/Text.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/Text.java index 716de3deb4..6022b99544 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/Text.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/Text.java @@ -77,6 +77,7 @@ protected CharsetDecoder initialValue() { private byte[] bytes = EMPTY_BYTES; private int length = 0; + private int textLength = -1; /** * Construct an empty text string. @@ -131,6 +132,17 @@ public int getLength() { return length; } + /** + * Returns the length of this text. The length is equal to the number of + * Unicode code units in the text. + */ + public int getTextLength() { + if (textLength < 0) { + textLength = toString().length(); + } + return textLength; + } + /** * Returns the Unicode Scalar Value (32-bit integer value) * for the character at position. 
Note that this @@ -204,6 +216,7 @@ public void set(String string) { ByteBuffer bb = encode(string, true); bytes = bb.array(); length = bb.limit(); + textLength = string.length(); } catch (CharacterCodingException e) { throw new RuntimeException("Should not have happened", e); } @@ -221,6 +234,7 @@ public void set(byte[] utf8) { */ public void set(Text other) { set(other.getBytes(), 0, other.getLength()); + this.textLength = other.textLength; } /** @@ -234,6 +248,7 @@ public void set(byte[] utf8, int start, int len) { ensureCapacity(len); System.arraycopy(utf8, start, bytes, 0, len); this.length = len; + this.textLength = -1; } /** @@ -251,6 +266,7 @@ public void append(byte[] utf8, int start, int len) { } System.arraycopy(utf8, start, bytes, length, len); length += len; + textLength = -1; } /** @@ -263,6 +279,7 @@ public void append(byte[] utf8, int start, int len) { */ public void clear() { length = 0; + textLength = -1; } /** @@ -327,6 +344,7 @@ public void readWithKnownLength(DataInput in, int len) throws IOException { ensureCapacity(len); in.readFully(bytes, 0, len); length = len; + textLength = -1; } /** diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/TestText.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/TestText.java index 54df39955d..700e106271 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/TestText.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/TestText.java @@ -268,6 +268,8 @@ public void testClear() throws Exception { 0, text.getBytes().length); assertEquals("String's length must be zero", 0, text.getLength()); + assertEquals("String's text length must be zero", + 0, text.getTextLength()); // Test if clear works as intended text = new Text("abcd\u20acbdcd\u20ac"); @@ -280,6 +282,8 @@ public void testClear() throws Exception { text.getBytes().length >= len); assertEquals("Length of the string must be reset to 0 after 
clear()", 0, text.getLength()); + assertEquals("Text length of the string must be reset to 0 after clear()", + 0, text.getTextLength()); } @Test @@ -288,9 +292,12 @@ public void testTextText() throws CharacterCodingException { Text b=new Text("a"); b.set(a); assertEquals("abc", b.toString()); + assertEquals(3, a.getTextLength()); + assertEquals(3, b.getTextLength()); a.append("xdefgxxx".getBytes(), 1, 4); assertEquals("modified aliased string", "abc", b.toString()); assertEquals("appended string incorrectly", "abcdefg", a.toString()); + assertEquals("This should reflect in the length", 7, a.getTextLength()); // add an extra byte so that capacity = 10 and length = 8 a.append(new byte[]{'d'}, 0, 1); assertEquals(10, a.getBytes().length); @@ -392,16 +399,19 @@ public void testReadWithKnownLength() throws IOException { in.reset(inputBytes, inputBytes.length); text.readWithKnownLength(in, 5); assertEquals("hello", text.toString()); + assertEquals(5, text.getTextLength()); // Read longer length, make sure it lengthens in.reset(inputBytes, inputBytes.length); text.readWithKnownLength(in, 7); assertEquals("hello w", text.toString()); + assertEquals(7, text.getTextLength()); // Read shorter length, make sure it shortens in.reset(inputBytes, inputBytes.length); text.readWithKnownLength(in, 2); assertEquals("he", text.toString()); + assertEquals(2, text.getTextLength()); } /**