HADOOP-18456. NullPointerException in ObjectListingIterator. (#4909)
This problem surfaced in impala integration tests IMPALA-11592. TestLocalCatalogRetries.test_fetch_metadata_retry fails in S3 build after the change HADOOP-17461. Add thread-level IOStatistics Context The actual GC race condition came with HADOOP-18091. S3A auditing leaks memory through ThreadLocal references The fix for this is, if our hypothesis is correct, in WeakReferenceMap.create() where a strong reference to the new value is kept in a local variable *and referred to later* so that the JVM will not GC it. Along with the fix, extra assertions ensure that if the problem is not fixed, applications will fail faster/more meaningfully. Contributed by Steve Loughran.
This commit is contained in:
parent
9a29075f91
commit
0676495950
@ -25,6 +25,8 @@
|
|||||||
|
|
||||||
import org.apache.hadoop.util.WeakReferenceMap;
|
import org.apache.hadoop.util.WeakReferenceMap;
|
||||||
|
|
||||||
|
import static java.util.Objects.requireNonNull;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A WeakReferenceMap for threads.
|
* A WeakReferenceMap for threads.
|
||||||
* @param <V> value type of the map
|
* @param <V> value type of the map
|
||||||
@ -36,30 +38,55 @@ public WeakReferenceThreadMap(final Function<? super Long, ? extends V> factory,
|
|||||||
super(factory, referenceLost);
|
super(factory, referenceLost);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the value for the current thread, creating if needed.
|
||||||
|
* @return an instance.
|
||||||
|
*/
|
||||||
public V getForCurrentThread() {
|
public V getForCurrentThread() {
|
||||||
return get(currentThreadId());
|
return get(currentThreadId());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove the reference for the current thread.
|
||||||
|
* @return any reference value which existed.
|
||||||
|
*/
|
||||||
public V removeForCurrentThread() {
|
public V removeForCurrentThread() {
|
||||||
return remove(currentThreadId());
|
return remove(currentThreadId());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the current thread ID.
|
||||||
|
* @return thread ID.
|
||||||
|
*/
|
||||||
public long currentThreadId() {
|
public long currentThreadId() {
|
||||||
return Thread.currentThread().getId();
|
return Thread.currentThread().getId();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the new value for the current thread.
|
||||||
|
* @param newVal new reference to set for the active thread.
|
||||||
|
* @return the previously set value, possibly null
|
||||||
|
*/
|
||||||
public V setForCurrentThread(V newVal) {
|
public V setForCurrentThread(V newVal) {
|
||||||
|
requireNonNull(newVal);
|
||||||
long id = currentThreadId();
|
long id = currentThreadId();
|
||||||
|
|
||||||
// if the same object is already in the map, just return it.
|
// if the same object is already in the map, just return it.
|
||||||
WeakReference<V> ref = lookup(id);
|
WeakReference<V> existingWeakRef = lookup(id);
|
||||||
// Reference value could be set to null. Thus, ref.get() could return
|
|
||||||
// null. Should be handled accordingly while using the returned value.
|
|
||||||
if (ref != null && ref.get() == newVal) {
|
|
||||||
return ref.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// The looked up reference could be one of
|
||||||
|
// 1. null: nothing there
|
||||||
|
// 2. valid but get() == null : reference lost by GC.
|
||||||
|
// 3. different from the new value
|
||||||
|
// 4. the same as the old value
|
||||||
|
if (resolve(existingWeakRef) == newVal) {
|
||||||
|
// case 4: do nothing, return the new value
|
||||||
|
return newVal;
|
||||||
|
} else {
|
||||||
|
// cases 1, 2, 3: update the map and return the old value
|
||||||
return put(id, newVal);
|
return put(id, newVal);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
@ -20,6 +20,8 @@
|
|||||||
|
|
||||||
import org.apache.hadoop.fs.statistics.impl.IOStatisticsContextIntegration;
|
import org.apache.hadoop.fs.statistics.impl.IOStatisticsContextIntegration;
|
||||||
|
|
||||||
|
import static java.util.Objects.requireNonNull;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An interface defined to capture thread-level IOStatistics by using per
|
* An interface defined to capture thread-level IOStatistics by using per
|
||||||
* thread context.
|
* thread context.
|
||||||
@ -67,7 +69,11 @@ public interface IOStatisticsContext extends IOStatisticsSource {
|
|||||||
* @return instance of IOStatisticsContext for the context.
|
* @return instance of IOStatisticsContext for the context.
|
||||||
*/
|
*/
|
||||||
static IOStatisticsContext getCurrentIOStatisticsContext() {
|
static IOStatisticsContext getCurrentIOStatisticsContext() {
|
||||||
return IOStatisticsContextIntegration.getCurrentIOStatisticsContext();
|
// the null check is just a safety check to highlight exactly where a null value would
|
||||||
|
// be returned if HADOOP-18456 has resurfaced.
|
||||||
|
return requireNonNull(
|
||||||
|
IOStatisticsContextIntegration.getCurrentIOStatisticsContext(),
|
||||||
|
"Null IOStatisticsContext");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -100,7 +100,10 @@ private IOStatisticsContextIntegration() {}
|
|||||||
* @return an instance of IOStatisticsContext.
|
* @return an instance of IOStatisticsContext.
|
||||||
*/
|
*/
|
||||||
private static IOStatisticsContext createNewInstance(Long key) {
|
private static IOStatisticsContext createNewInstance(Long key) {
|
||||||
return new IOStatisticsContextImpl(key, INSTANCE_ID.getAndIncrement());
|
IOStatisticsContextImpl instance =
|
||||||
|
new IOStatisticsContextImpl(key, INSTANCE_ID.getAndIncrement());
|
||||||
|
LOG.debug("Created instance {}", instance);
|
||||||
|
return instance;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -131,9 +134,11 @@ public static void setThreadIOStatisticsContext(
|
|||||||
IOStatisticsContext statisticsContext) {
|
IOStatisticsContext statisticsContext) {
|
||||||
if (isThreadIOStatsEnabled) {
|
if (isThreadIOStatsEnabled) {
|
||||||
if (statisticsContext == null) {
|
if (statisticsContext == null) {
|
||||||
|
// new value is null, so remove it
|
||||||
ACTIVE_IOSTATS_CONTEXT.removeForCurrentThread();
|
ACTIVE_IOSTATS_CONTEXT.removeForCurrentThread();
|
||||||
}
|
} else {
|
||||||
if (ACTIVE_IOSTATS_CONTEXT.getForCurrentThread() != statisticsContext) {
|
// the setter is efficient in that it does not create a new
|
||||||
|
// reference if the context is unchanged.
|
||||||
ACTIVE_IOSTATS_CONTEXT.setForCurrentThread(statisticsContext);
|
ACTIVE_IOSTATS_CONTEXT.setForCurrentThread(statisticsContext);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -28,7 +28,11 @@
|
|||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import org.apache.hadoop.classification.InterfaceAudience;
|
import org.apache.hadoop.classification.InterfaceAudience;
|
||||||
|
import org.apache.hadoop.fs.store.LogExactlyOnce;
|
||||||
|
|
||||||
import static java.util.Objects.requireNonNull;
|
import static java.util.Objects.requireNonNull;
|
||||||
|
|
||||||
@ -52,6 +56,9 @@
|
|||||||
@InterfaceAudience.Private
|
@InterfaceAudience.Private
|
||||||
public class WeakReferenceMap<K, V> {
|
public class WeakReferenceMap<K, V> {
|
||||||
|
|
||||||
|
private static final Logger LOG =
|
||||||
|
LoggerFactory.getLogger(WeakReferenceMap.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The reference map.
|
* The reference map.
|
||||||
*/
|
*/
|
||||||
@ -79,6 +86,12 @@ public class WeakReferenceMap<K, V> {
|
|||||||
*/
|
*/
|
||||||
private final AtomicLong entriesCreatedCount = new AtomicLong();
|
private final AtomicLong entriesCreatedCount = new AtomicLong();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Log to report loss of a reference during the create phase, which
|
||||||
|
* is believed to be a cause of HADOOP-18456.
|
||||||
|
*/
|
||||||
|
private final LogExactlyOnce referenceLostDuringCreation = new LogExactlyOnce(LOG);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* instantiate.
|
* instantiate.
|
||||||
* @param factory supplier of new instances
|
* @param factory supplier of new instances
|
||||||
@ -132,35 +145,93 @@ public WeakReference<V> lookup(K key) {
|
|||||||
* @return an instance.
|
* @return an instance.
|
||||||
*/
|
*/
|
||||||
public V get(K key) {
|
public V get(K key) {
|
||||||
final WeakReference<V> current = lookup(key);
|
final WeakReference<V> currentWeakRef = lookup(key);
|
||||||
V val = resolve(current);
|
// resolve it, after which if not null, we have a strong reference
|
||||||
if (val != null) {
|
V strongVal = resolve(currentWeakRef);
|
||||||
|
if (strongVal != null) {
|
||||||
// all good.
|
// all good.
|
||||||
return val;
|
return strongVal;
|
||||||
}
|
}
|
||||||
|
|
||||||
// here, either no ref, or the value is null
|
// here, either currentWeakRef was null, or its reference was GC'd.
|
||||||
if (current != null) {
|
if (currentWeakRef != null) {
|
||||||
|
// garbage collection removed the reference.
|
||||||
|
|
||||||
|
// explicitly remove the weak ref from the map if it has not
|
||||||
|
// been updated by this point
|
||||||
|
// this is here just for completeness.
|
||||||
|
map.remove(key, currentWeakRef);
|
||||||
|
|
||||||
|
// log/report the loss.
|
||||||
noteLost(key);
|
noteLost(key);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// create a new value and add it to the map
|
||||||
return create(key);
|
return create(key);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new instance under a key.
|
* Create a new instance under a key.
|
||||||
|
* <p>
|
||||||
* The instance is created, added to the map and then the
|
* The instance is created, added to the map and then the
|
||||||
* map value retrieved.
|
* map value retrieved.
|
||||||
* This ensures that the reference returned is that in the map,
|
* This ensures that the reference returned is that in the map,
|
||||||
* even if there is more than one entry being created at the same time.
|
* even if there is more than one entry being created at the same time.
|
||||||
|
* If that race does occur, it will be logged the first time it happens
|
||||||
|
* for this specific map instance.
|
||||||
|
* <p>
|
||||||
|
* HADOOP-18456 highlighted the risk of a concurrent GC resulting a null
|
||||||
|
* value being retrieved and so returned.
|
||||||
|
* To prevent this:
|
||||||
|
* <ol>
|
||||||
|
* <li>A strong reference is retained to the newly created instance
|
||||||
|
* in a local variable.</li>
|
||||||
|
* <li>That variable is used after the resolution process, to ensure
|
||||||
|
* the JVM doesn't consider it "unreachable" and so eligible for GC.</li>
|
||||||
|
* <li>A check is made for the resolved reference being null, and if so,
|
||||||
|
* the put() is repeated</li>
|
||||||
|
* </ol>
|
||||||
* @param key key
|
* @param key key
|
||||||
* @return the value
|
* @return the created value
|
||||||
*/
|
*/
|
||||||
public V create(K key) {
|
public V create(K key) {
|
||||||
entriesCreatedCount.incrementAndGet();
|
entriesCreatedCount.incrementAndGet();
|
||||||
WeakReference<V> newRef = new WeakReference<>(
|
/*
|
||||||
requireNonNull(factory.apply(key)));
|
Get a strong ref so even if a GC happens in this method the reference is not lost.
|
||||||
map.put(key, newRef);
|
It is NOT enough to have a reference in a field, it MUST be used
|
||||||
return map.get(key).get();
|
so as to ensure the reference isn't optimized away prematurely.
|
||||||
|
"A reachable object is any object that can be accessed in any potential continuing
|
||||||
|
computation from any live thread."
|
||||||
|
*/
|
||||||
|
|
||||||
|
final V strongRef = requireNonNull(factory.apply(key),
|
||||||
|
"factory returned a null instance");
|
||||||
|
V resolvedStrongRef;
|
||||||
|
do {
|
||||||
|
WeakReference<V> newWeakRef = new WeakReference<>(strongRef);
|
||||||
|
|
||||||
|
// put it in the map
|
||||||
|
map.put(key, newWeakRef);
|
||||||
|
|
||||||
|
// get it back from the map
|
||||||
|
WeakReference<V> retrievedWeakRef = map.get(key);
|
||||||
|
// resolve that reference, handling the situation where somehow it was removed from the map
|
||||||
|
// between the put() and the get()
|
||||||
|
resolvedStrongRef = resolve(retrievedWeakRef);
|
||||||
|
if (resolvedStrongRef == null) {
|
||||||
|
referenceLostDuringCreation.warn("reference to %s lost during creation", key);
|
||||||
|
noteLost(key);
|
||||||
|
}
|
||||||
|
} while (resolvedStrongRef == null);
|
||||||
|
|
||||||
|
// note if there was any change in the reference.
|
||||||
|
// as this forces strongRef to be kept in scope
|
||||||
|
if (strongRef != resolvedStrongRef) {
|
||||||
|
LOG.debug("Created instance for key {}: {} overwritten by {}",
|
||||||
|
key, strongRef, resolvedStrongRef);
|
||||||
|
}
|
||||||
|
|
||||||
|
return resolvedStrongRef;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -203,7 +274,7 @@ public boolean containsKey(K key) {
|
|||||||
* @param r reference to resolve
|
* @param r reference to resolve
|
||||||
* @return the value or null
|
* @return the value or null
|
||||||
*/
|
*/
|
||||||
private V resolve(WeakReference<V> r) {
|
protected V resolve(WeakReference<V> r) {
|
||||||
return r == null ? null : r.get();
|
return r == null ? null : r.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -233,7 +304,7 @@ public int prune() {
|
|||||||
* @param key key of lost reference
|
* @param key key of lost reference
|
||||||
*/
|
*/
|
||||||
private void noteLost(final K key) {
|
private void noteLost(final K key) {
|
||||||
// incrment local counter
|
// increment local counter
|
||||||
referenceLostCount.incrementAndGet();
|
referenceLostCount.incrementAndGet();
|
||||||
|
|
||||||
// and call any notification function supplied in the constructor
|
// and call any notification function supplied in the constructor
|
||||||
|
@ -20,15 +20,19 @@
|
|||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
|
|
||||||
import org.assertj.core.api.Assertions;
|
import org.assertj.core.api.Assertions;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.impl.WeakReferenceThreadMap;
|
||||||
import org.apache.hadoop.test.AbstractHadoopTestBase;
|
import org.apache.hadoop.test.AbstractHadoopTestBase;
|
||||||
|
|
||||||
|
import static org.apache.hadoop.test.LambdaTestUtils.intercept;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test {@link WeakReferenceMap}.
|
* Test {@link WeakReferenceMap} and {@link WeakReferenceThreadMap}.
|
||||||
* There's no attempt to force GC here, so the tests are
|
* There's no attempt to force GC here, so the tests are
|
||||||
* more about the basic behavior not the handling of empty references.
|
* more about the basic behavior not the handling of empty references.
|
||||||
*/
|
*/
|
||||||
@ -125,11 +129,92 @@ public void testDemandCreateEntries() {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* It is an error to have a factory which returns null.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testFactoryReturningNull() throws Throwable {
|
||||||
|
referenceMap = new WeakReferenceMap<>(
|
||||||
|
(k) -> null,
|
||||||
|
null);
|
||||||
|
intercept(NullPointerException.class, () ->
|
||||||
|
referenceMap.get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the WeakReferenceThreadMap extension.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testWeakReferenceThreadMapAssignment()
|
||||||
|
throws Throwable {
|
||||||
|
|
||||||
|
// counters foor the callbacks
|
||||||
|
final AtomicLong created = new AtomicLong();
|
||||||
|
final AtomicLong lost = new AtomicLong();
|
||||||
|
|
||||||
|
WeakReferenceThreadMap<String> threadMap = new WeakReferenceThreadMap<>(
|
||||||
|
id -> "Entry for thread ID " + id + " (" + created.incrementAndGet() + ")",
|
||||||
|
id -> lost.incrementAndGet());
|
||||||
|
|
||||||
|
Assertions.assertThat(threadMap.setForCurrentThread("hello"))
|
||||||
|
.describedAs("current thread map value on first set")
|
||||||
|
.isNull();
|
||||||
|
|
||||||
|
// second attempt returns itself
|
||||||
|
Assertions.assertThat(threadMap.setForCurrentThread("hello"))
|
||||||
|
.describedAs("current thread map value on second set")
|
||||||
|
.isEqualTo("hello");
|
||||||
|
|
||||||
|
// it is forbidden to explicitly set to null via the set() call.
|
||||||
|
intercept(NullPointerException.class, () ->
|
||||||
|
threadMap.setForCurrentThread(null));
|
||||||
|
|
||||||
|
// the map is unchanged
|
||||||
|
Assertions.assertThat(threadMap.getForCurrentThread())
|
||||||
|
.describedAs("current thread map value")
|
||||||
|
.isEqualTo("hello");
|
||||||
|
|
||||||
|
// remove the value and assert what the removed entry was
|
||||||
|
Assertions.assertThat(threadMap.removeForCurrentThread())
|
||||||
|
.describedAs("removed thread map value")
|
||||||
|
.isEqualTo("hello");
|
||||||
|
|
||||||
|
// remove the value again; this time the removed value is null
|
||||||
|
Assertions.assertThat(threadMap.removeForCurrentThread())
|
||||||
|
.describedAs("removed thread map value on second call")
|
||||||
|
.isNull();
|
||||||
|
|
||||||
|
// lookup will return a new instance created by the factory
|
||||||
|
long c1 = created.get();
|
||||||
|
String dynamicValue = threadMap.getForCurrentThread();
|
||||||
|
Assertions.assertThat(dynamicValue)
|
||||||
|
.describedAs("dynamically created thread map value")
|
||||||
|
.startsWith("Entry for thread ID")
|
||||||
|
.contains("(" + (c1 + 1) + ")");
|
||||||
|
|
||||||
|
// and we can overwrite that
|
||||||
|
Assertions.assertThat(threadMap.setForCurrentThread("hello2"))
|
||||||
|
.describedAs("value before the thread entry is changed")
|
||||||
|
.isEqualTo(dynamicValue);
|
||||||
|
|
||||||
|
// simulate a weak gc
|
||||||
|
long threadId = threadMap.currentThreadId();
|
||||||
|
threadMap.put(threadId, null);
|
||||||
|
String updated = threadMap.getForCurrentThread();
|
||||||
|
Assertions.assertThat(lost.get())
|
||||||
|
.describedAs("lost count")
|
||||||
|
.isEqualTo(1);
|
||||||
|
Assertions.assertThat(updated)
|
||||||
|
.describedAs("dynamically created thread map value")
|
||||||
|
.startsWith("Entry for thread ID")
|
||||||
|
.contains("(" + (c1 + 2) + ")");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Assert that the value of a map entry is as expected.
|
* Assert that the value of a map entry is as expected.
|
||||||
* Will trigger entry creation if the key is absent.
|
* Will trigger entry creation if the key is absent.
|
||||||
* @param key key
|
* @param key key
|
||||||
* @param val expected valued
|
* @param val expected value
|
||||||
*/
|
*/
|
||||||
private void assertMapEntryEquals(int key, String val) {
|
private void assertMapEntryEquals(int key, String val) {
|
||||||
Assertions.assertThat(referenceMap.get(key))
|
Assertions.assertThat(referenceMap.get(key))
|
||||||
@ -143,7 +228,7 @@ private void assertMapEntryEquals(int key, String val) {
|
|||||||
*/
|
*/
|
||||||
private void assertMapContainsKey(int key) {
|
private void assertMapContainsKey(int key) {
|
||||||
Assertions.assertThat(referenceMap.containsKey(key))
|
Assertions.assertThat(referenceMap.containsKey(key))
|
||||||
.describedAs("map enty of key %d should be present", key)
|
.describedAs("map entry of key %d should be present", key)
|
||||||
.isTrue();
|
.isTrue();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user