HADOOP-8655. Fix TextInputFormat for large deliminators. (Gelesh via bobby)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1376592 13f79535-47bb-0310-9956-ffa450edef68
2012-08-23 16:58:36 +00:00 · 2012-08-23 16:58:36 +00:00 · 34bd9794f7
commit 34bd9794f7
parent 42beb56a2e
3 changed files with 179 additions and 30 deletions
--- a/hadoop-common-project/hadoop-common/CHANGES.txt
+++ b/hadoop-common-project/hadoop-common/CHANGES.txt
@ -832,6 +832,9 @@ Release 2.0.0-alpha - 05-23-2012
    HADOOP-7868. Hadoop native fails to compile when default linker
    option is -Wl,--as-needed. (Trevor Robinson via eli)
    HADOOP-8655. Fix TextInputFormat for large deliminators. (Gelesh via
    bobby) 
 Release 0.23.3 - UNRELEASED
  INCOMPATIBLE CHANGES
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LineReader.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LineReader.java
@ -204,12 +204,14 @@ private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume)
      int startPosn = bufferPosn; //starting from where we left off the last time
      if (bufferPosn >= bufferLength) {
        startPosn = bufferPosn = 0;
-        if (prevCharCR)
+        if (prevCharCR) {
          ++bytesConsumed; //account for CR from previous read
        }
        bufferLength = in.read(buffer);
-        if (bufferLength <= 0)
+        if (bufferLength <= 0) {
          break; // EOF
        }
      }
      for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
        if (buffer[bufferPosn] == LF) {
          newlineLength = (prevCharCR) ? 2 : 1;
@ -223,8 +225,9 @@ private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume)
        prevCharCR = (buffer[bufferPosn] == CR);
      }
      int readLength = bufferPosn - startPosn;
-      if (prevCharCR && newlineLength == 0)
+      if (prevCharCR && newlineLength == 0) {
        --readLength; //CR at the end of the buffer
      }
      bytesConsumed += readLength;
      int appendLength = readLength - newlineLength;
      if (appendLength > maxLineLength - txtLength) {
@ -236,8 +239,9 @@ private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume)
      }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);
-    if (bytesConsumed > (long)Integer.MAX_VALUE)
+    if (bytesConsumed > (long)Integer.MAX_VALUE) {
      throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }
    return (int)bytesConsumed;
  }
@ -246,19 +250,57 @@ private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume)
   */
  private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume)
      throws IOException {
   /* We're reading data from inputStream, but the head of the stream may be
    *  already captured in the previous buffer, so we have several cases:
    * 
    * 1. The buffer tail does not contain any character sequence which
    *    matches with the head of delimiter. We count it as a 
    *    ambiguous byte count = 0
    *    
    * 2. The buffer tail contains a X number of characters,
    *    that forms a sequence, which matches with the
    *    head of delimiter. We count ambiguous byte count = X
    *    
    *    // ***  eg: A segment of input file is as follows
    *    
    *    " record 1792: I found this bug very interesting and
    *     I have completely read about it. record 1793: This bug
    *     can be solved easily record 1794: This ." 
    *    
    *    delimiter = "record";
    *        
    *    supposing:- String at the end of buffer =
    *    "I found this bug very interesting and I have completely re"
    *    There for next buffer = "ad about it. record 179       ...."           
    *     
    *     The matching characters in the input
    *     buffer tail and delimiter head = "re" 
    *     Therefore, ambiguous byte count = 2 ****   //
    *     
    *     2.1 If the following bytes are the remaining characters of
    *         the delimiter, then we have to capture only up to the starting 
    *         position of delimiter. That means, we need not include the 
    *         ambiguous characters in str.
    *     
    *     2.2 If the following bytes are not the remaining characters of
    *         the delimiter ( as mentioned in the example ), 
    *         then we have to include the ambiguous characters in str. 
    */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    long bytesConsumed = 0;
    int delPosn = 0;
    int ambiguousByteCount=0; // To capture the ambiguous characters count
    do {
-      int startPosn = bufferPosn; // starting from where we left off the last
+      int startPosn = bufferPosn; // Start from previous end position
      // time
      if (bufferPosn >= bufferLength) {
        startPosn = bufferPosn = 0;
        bufferLength = in.read(buffer);
-        if (bufferLength <= 0)
+        if (bufferLength <= 0) {
          str.append(recordDelimiterBytes, 0, ambiguousByteCount);
          break; // EOF
        }
      }
      for (; bufferPosn < bufferLength; ++bufferPosn) {
        if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
          delPosn++;
@ -267,7 +309,7 @@ private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume)
            break;
          }
        } else if (delPosn != 0) {
-          bufferPosn--; // recheck if bufferPosn matches start of delimiter
+          bufferPosn--;
          delPosn = 0;
        }
      }
@ -278,13 +320,26 @@ private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume)
        appendLength = maxLineLength - txtLength;
      }
      if (appendLength > 0) {
        if (ambiguousByteCount > 0) {
          str.append(recordDelimiterBytes, 0, ambiguousByteCount);
          //appending the ambiguous characters (refer case 2.2)
          bytesConsumed += ambiguousByteCount;
          ambiguousByteCount=0;
        }
        str.append(buffer, startPosn, appendLength);
        txtLength += appendLength;
      }
      if (bufferPosn >= bufferLength) {
        if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
          ambiguousByteCount = delPosn;
          bytesConsumed -= ambiguousByteCount; //to be consumed in next
        }
      }
    } while (delPosn < recordDelimiterBytes.length 
        && bytesConsumed < maxBytesToConsume);
-    if (bytesConsumed > (long) Integer.MAX_VALUE)
+    if (bytesConsumed > (long) Integer.MAX_VALUE) {
      throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
    }
    return (int) bytesConsumed; 
  }
@ -308,5 +363,4 @@ public int readLine(Text str, int maxLineLength) throws IOException {
  public int readLine(Text str) throws IOException {
    return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE);
  }
 }
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestLineReader.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestLineReader.java
@ -21,29 +21,121 @@
 import java.io.ByteArrayInputStream;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.util.LineReader;
 import org.junit.Test;
 import junit.framework.Assert;
 public class TestLineReader {
  private LineReader lineReader;
  private String TestData;
  private String Delimiter;
  private Text line;
  @Test
  public void testCustomDelimiter() throws Exception {
-    String data = "record Bangalorrecord recorrecordrecord Kerala";
+    /* TEST_1
-    String delimiter = "record";
+     * The test scenario is the tail of the buffer
-    LineReader reader = new LineReader(
+     * equals the starting character/s of delimiter
-        new ByteArrayInputStream(data.getBytes()),
+     * 
-        delimiter.getBytes());
+     * The Test Data is such that,
-    Text line = new Text();
+     *   
-    reader.readLine(line);
+     * 1) we will have "</entity>" as delimiter  
     *  
     * 2) The tail of the current buffer would be "</"
     *    which matches with the starting character sequence of delimiter.
     *    
     * 3) The Head of the next buffer would be   "id>" 
     *    which does NOT match with the remaining characters of delimiter.
     *   
     * 4) Input data would be prefixed by char 'a' 
     *    about numberOfCharToFillTheBuffer times.
     *    So that, one iteration to buffer the input data,
     *    would end at '</' ie equals starting 2 char of delimiter  
     *     
     * 5) For this we would take BufferSize as 64 * 1024;
     * 
     * Check Condition
     *  In the second key value pair, the value should contain 
     *  "</"  from currentToken and
     *  "id>" from next token 
     */  
    Delimiter="</entity>"; 
    String CurrentBufferTailToken=
        "</entity><entity><id>Gelesh</";
    // Ending part of Input Data Buffer
    // It contains '</' ie delimiter character 
    String NextBufferHeadToken=
        "id><name>Omathil</name></entity>";
    // Supposing the start of next buffer is this
    String Expected = 
        (CurrentBufferTailToken+NextBufferHeadToken)
        .replace(Delimiter, "");                       
    // Expected ,must capture from both the buffer, excluding Delimiter
    String TestPartOfInput = CurrentBufferTailToken+NextBufferHeadToken;
    int BufferSize=64 * 1024;
    int numberOfCharToFillTheBuffer=BufferSize-CurrentBufferTailToken.length();
    StringBuilder fillerString=new StringBuilder();
    for (int i=0;i<numberOfCharToFillTheBuffer;i++) {  
      fillerString.append('a'); // char 'a' as a filler for the test string
    }
    TestData = fillerString + TestPartOfInput;
    lineReader = new LineReader(
        new ByteArrayInputStream(TestData.getBytes()),Delimiter.getBytes());
    line = new Text();
    lineReader.readLine(line); 
    Assert.assertEquals(fillerString.toString(),line.toString());
    lineReader.readLine(line);
    Assert.assertEquals(Expected, line.toString());
    /*TEST_2
     * The test scenario is such that,
     * the character/s preceding the delimiter,
     * equals the starting character/s of delimiter
     */
    Delimiter = "record";
    StringBuilder TestStringBuilder = new StringBuilder();
    TestStringBuilder.append(Delimiter+"Kerala ");
    TestStringBuilder.append(Delimiter+"Bangalore");
    TestStringBuilder.append(Delimiter+" North Korea");
    TestStringBuilder.append(Delimiter+Delimiter+
                        "Guantanamo");
    TestStringBuilder.append(Delimiter+"ecord"+"recor"+"core"); //~EOF with 're'
    TestData=TestStringBuilder.toString();
    lineReader = new LineReader(
        new ByteArrayInputStream(TestData.getBytes()),Delimiter.getBytes());
    lineReader.readLine(line); 
    Assert.assertEquals("",line.toString());
-    reader.readLine(line);
+    lineReader.readLine(line); 
    Assert.assertEquals(" Bangalor", line.toString());
    reader.readLine(line);
    Assert.assertEquals(" recor", line.toString());
    reader.readLine(line);
    Assert.assertEquals("", line.toString());
    reader.readLine(line);
    Assert.assertEquals("Kerala ",line.toString());
    lineReader.readLine(line); 
    Assert.assertEquals("Bangalore",line.toString());
    lineReader.readLine(line); 
    Assert.assertEquals(" North Korea",line.toString());
    lineReader.readLine(line); 
    Assert.assertEquals("",line.toString());
    lineReader.readLine(line); 
    Assert.assertEquals("Guantanamo",line.toString());
    lineReader.readLine(line); 
    Assert.assertEquals(("ecord"+"recor"+"core"),line.toString());
  }
 }