HADOOP-14089. Automated checking for malformed client. Contributed by Sean Busbey.

This commit is contained in:
Andrew Wang 2017-09-13 16:57:50 -07:00
parent bb34ae9554
commit c3f35c422b
8 changed files with 438 additions and 9 deletions

View File

@ -182,6 +182,21 @@
<exclude>io/serializations</exclude>
</excludes>
</relocation>
<!-- JSRs that haven't made it to inclusion in J2SE -->
<relocation>
<pattern>javax/el/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.el.</shadedPattern>
<excludes>
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>javax/cache/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.cache.</shadedPattern>
<excludes>
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>javax/servlet/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.servlet.</shadedPattern>
@ -189,6 +204,13 @@
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>javax/ws/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.ws.</shadedPattern>
<excludes>
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>net/</pattern>
<shadedPattern>${shaded.dependency.prefix}.net.</shadedPattern>
@ -199,6 +221,11 @@
<exclude>net/topology/**/*</exclude>
</excludes>
</relocation>
<!-- okio declares a top level package instead of nested -->
<relocation>
<pattern>okio/</pattern>
<shadedPattern>${shaded.dependency.prefix}.okio.</shadedPattern>
</relocation>
</relocations>
<transformers>
<!-- Needed until MSHADE-182 -->

View File

@ -25,7 +25,13 @@
<version>3.1.0-SNAPSHOT</version>
<packaging>pom</packaging>
<description>Enforces our invariants for the api and runtime client modules.</description>
<description>
Enforces our invariants for the api and runtime client modules.
E.g. that modules have a specific set of transitive dependencies
and shaded artifacts only contain classes that are in particular
packages. Does the enforcement through the maven-enforcer-plugin
and an integration test.
</description>
<name>Apache Hadoop Client Packaging Invariants</name>
<properties>
@ -82,6 +88,8 @@
<exclude>commons-logging:commons-logging</exclude>
<!-- Leave log4j unshaded so downstream users can configure logging. -->
<exclude>log4j:log4j</exclude>
<!-- Leave javax annotations we need exposed -->
<exclude>com.google.code.findbugs:jsr305</exclude>
</excludes>
</banTransitiveDependencies>
<banDuplicateClasses>
@ -97,7 +105,6 @@
</dependencies>
</banDuplicateClasses>
</rules>
<!-- TODO we need a rule for "we don't have classes that are outside of the org.apache.hadoop package" -->
<!-- TODO we need a rule for "the constants in this set of classes haven't been shaded / don't have this prefix"
Manually checking the set of Keys that look like packages we relocate:
@ -116,6 +123,69 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<!-- Explicitly bind testResources to pre-integration-test. The
check-jar-contents execution of exec-maven-plugin runs
ensure-jars-have-correct-contents.sh with
${project.build.testOutputDirectory} as its working directory, so
presumably this execution is what copies the script there (TODO confirm
the script lives under the test resources directory).
-->
<executions>
<execution>
<id>test-resources</id>
<phase>pre-integration-test</phase>
<goals>
<goal>testResources</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<!-- create a maven pom property that has all of our dependencies.
below in the integration-test phase we'll pass this list
of paths to our jar checker script.
-->
<execution>
<id>put-client-artifacts-in-a-property</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build-classpath</goal>
</goals>
<configuration>
<!-- only the direct dependencies of this module are listed -->
<excludeTransitive>true</excludeTransitive>
<!-- read back as ${hadoop-client-artifacts} by the check-jar-contents
execution. NOTE(review): the checker script splits this value on ':',
so this presumably assumes a Unix style path separator; confirm. -->
<outputProperty>hadoop-client-artifacts</outputProperty>
</configuration>
</execution>
</executions>
</plugin>
<!--
Check that we actually relocated everything we included.
It's critical that we don't ship third party dependencies that haven't
been relocated under our package space, since this will lead to
difficult to debug classpath errors for downstream. Unfortunately, that
means inspecting all the jars.
-->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<id>check-jar-contents</id>
<phase>integration-test</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<!-- run the checker script through the configured shell rather than
executing it directly -->
<executable>${shell-executable}</executable>
<!-- the script name below is resolved relative to this directory -->
<workingDirectory>${project.build.testOutputDirectory}</workingDirectory>
<requiresOnline>false</requiresOnline>
<arguments>
<argument>ensure-jars-have-correct-contents.sh</argument>
<!-- list of jar paths set by the put-client-artifacts-in-a-property
execution of maven-dependency-plugin -->
<argument>${hadoop-client-artifacts}</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>

View File

@ -0,0 +1,82 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage: $0 [/path/to/some/example.jar:/path/to/another/example/created.jar]
#
# accepts a single command line argument with a colon separated list of
# paths to jars to check. Iterates through each such passed jar and checks
# all the contained paths to make sure they follow the below constructed
# safe list.
# Build up a single extended regular expression (used with 'grep -E') that
# matches every jar entry we consider acceptable. Literal dots in file names
# are escaped so that '.' does not act as a "match any character" wildcard
# and silently widen the safe list.
# we have to allow the directories that lead to the org/apache/hadoop dir
allowed_expr="(^org/$|^org/apache/$"
# We allow the following things to exist in our client artifacts:
# * classes in packages that start with org.apache.hadoop, which by
# convention should be in a path that looks like org/apache/hadoop
allowed_expr+="|^org/apache/hadoop/"
# * whatever in the "META-INF" directory
allowed_expr+="|^META-INF/"
# * whatever under the "webapps" directory; for things shipped by yarn
allowed_expr+="|^webapps/"
# * Hadoop's default configuration files, which have the form
# "_module_-default.xml"
allowed_expr+="|^[^-]*-default\.xml$"
# * Hadoop's versioning properties files, which have the form
# "_module_-version-info.properties"
allowed_expr+="|^[^-]*-version-info\.properties$"
# * Hadoop's application classloader properties file.
allowed_expr+="|^org\.apache\.hadoop\.application-classloader\.properties$"
# public suffix list used by httpcomponents
allowed_expr+="|^mozilla/$"
allowed_expr+="|^mozilla/public-suffix-list\.txt$"
# Comes from commons-configuration, not sure if relocatable.
allowed_expr+="|^properties\.dtd$"
allowed_expr+="|^PropertyList-1\.0\.dtd$"
# Comes from Ehcache, not relocatable at top level due to limitation
# of shade plugin AFAICT
allowed_expr+="|^ehcache-core\.xsd$"
allowed_expr+="|^ehcache-107ext\.xsd$"
# Comes from kerby's kerb-simplekdc, not relocatable since at top level
allowed_expr+="|^krb5-template\.conf$"
allowed_expr+="|^krb5_udp-template\.conf$"
# Jetty uses this style sheet for directory listings. TODO ensure our
# internal use of jetty disallows directory listings and remove this.
allowed_expr+="|^jetty-dir\.css$"
allowed_expr+=")"
# Walk every jar named in the colon separated list passed as "$1" and report
# each entry whose path is not matched by ${allowed_expr}. Exits with status 1
# when at least one jar has unexpected entries or could not be listed.
declare -i bad_artifacts=0
declare -a bad_contents
declare -a artifact_list
declare jar_listing
declare entry
# NUL-terminate the argument so 'read' consumes all of it, splitting on ':'.
IFS=: read -r -d '' -a artifact_list < <(printf '%s\0' "$1")
for artifact in "${artifact_list[@]}"; do
  # List the jar on its own first: when 'jar' fails (missing or corrupt file)
  # its empty output must not be mistaken for "nothing unexpected found".
  if ! jar_listing="$(jar tf "${artifact}")"; then
    echo "[ERROR] Could not list the contents of artifact: '${artifact}'"
    bad_artifacts=$((bad_artifacts + 1))
    continue
  fi
  # Collect offending entries one line at a time so that paths containing
  # whitespace or glob characters are kept intact instead of being word-split
  # and expanded against the working directory.
  bad_contents=()
  while IFS= read -r entry; do
    if [ -n "${entry}" ]; then
      bad_contents+=("${entry}")
    fi
  done < <(printf '%s\n' "${jar_listing}" | grep -v -E "${allowed_expr}")
  if [ ${#bad_contents[@]} -gt 0 ]; then
    echo "[ERROR] Found artifact with unexpected contents: '${artifact}'"
    echo " Please check the following and either correct the build or update"
    echo " the allowed list with reasoning."
    echo ""
    for bad_line in "${bad_contents[@]}"; do
      echo " ${bad_line}"
    done
    bad_artifacts=$((bad_artifacts + 1))
  else
    echo "[INFO] Artifact looks correct: '$(basename "${artifact}")'"
  fi
done
if [ "${bad_artifacts}" -gt 0 ]; then
  exit 1
fi

View File

@ -25,7 +25,13 @@
<version>3.1.0-SNAPSHOT</version>
<packaging>pom</packaging>
<description>Enforces our invariants for the testing client modules.</description>
<description>
Enforces our invariants for the test client modules.
E.g. that modules have a specific set of transitive dependencies
and shaded artifacts only contain classes that are in particular
packages. Does the enforcement through the maven-enforcer-plugin
and an integration test.
</description>
<name>Apache Hadoop Client Packaging Invariants for Test</name>
<properties>
@ -90,6 +96,8 @@
<exclude>junit:junit</exclude>
<!-- JUnit brings in hamcrest -->
<exclude> org.hamcrest:hamcrest-core</exclude>
<!-- Leave javax annotations we need exposed -->
<exclude>com.google.code.findbugs:jsr305</exclude>
</excludes>
</banTransitiveDependencies>
<banDuplicateClasses>
@ -105,7 +113,6 @@
</dependencies>
</banDuplicateClasses>
</rules>
<!-- TODO we need a rule for "we don't have classes that are outside of the org.apache.hadoop package" -->
<!-- TODO we need a rule for "the constants in this set of classes haven't been shaded / don't have this prefix"
Manually checking the set of Keys that look like packages we relocate:
@ -124,6 +131,71 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<!-- Explicitly bind testResources to pre-integration-test. The
check-jar-contents execution of exec-maven-plugin runs
ensure-jars-have-correct-contents.sh with
${project.build.testOutputDirectory} as its working directory, so
presumably this execution is what copies the script there (TODO confirm
the script lives under the test resources directory).
-->
<executions>
<execution>
<id>test-resources</id>
<phase>pre-integration-test</phase>
<goals>
<goal>testResources</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- create a maven pom property that has all of our dependencies.
below in the integration-test phase we'll pass this list
of paths to our jar checker script.
-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>put-client-artifacts-in-a-property</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build-classpath</goal>
</goals>
<configuration>
<!-- these two get covered in our non-test invariant check -->
<excludeArtifactIds>hadoop-client-api,hadoop-client-runtime</excludeArtifactIds>
<!-- only the direct dependencies of this module are listed -->
<excludeTransitive>true</excludeTransitive>
<!-- read back as ${hadoop-client-artifacts} by the check-jar-contents
execution. NOTE(review): the checker script splits this value on ':',
so this presumably assumes a Unix style path separator; confirm. -->
<outputProperty>hadoop-client-artifacts</outputProperty>
</configuration>
</execution>
</executions>
</plugin>
<!--
Check that we actually relocated everything we included.
It's critical that we don't ship third party dependencies that haven't
been relocated under our package space, since this will lead to
difficult to debug classpath errors for downstream. Unfortunately, that
means inspecting all the jars.
-->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<id>check-jar-contents</id>
<phase>integration-test</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<!-- run the checker script through the configured shell rather than
executing it directly -->
<executable>${shell-executable}</executable>
<!-- the script name below is resolved relative to this directory -->
<workingDirectory>${project.build.testOutputDirectory}</workingDirectory>
<requiresOnline>false</requiresOnline>
<arguments>
<argument>ensure-jars-have-correct-contents.sh</argument>
<!-- list of jar paths set by the put-client-artifacts-in-a-property
execution of maven-dependency-plugin -->
<argument>${hadoop-client-artifacts}</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>

View File

@ -0,0 +1,70 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage: $0 [/path/to/some/example.jar:/path/to/another/example/created.jar]
#
# accepts a single command line argument with a colon separated list of
# paths to jars to check. Iterates through each such passed jar and checks
# all the contained paths to make sure they follow the below constructed
# safe list.
# Build up a single extended regular expression (used with 'grep -E') that
# matches every jar entry we consider acceptable. Literal dots in file names
# are escaped so that '.' does not act as a "match any character" wildcard
# and silently widen the safe list.
# we have to allow the directories that lead to the org/apache/hadoop dir
allowed_expr="(^org/$|^org/apache/$"
# We allow the following things to exist in our client artifacts:
# * classes in packages that start with org.apache.hadoop, which by
# convention should be in a path that looks like org/apache/hadoop
allowed_expr+="|^org/apache/hadoop/"
# * whatever in the "META-INF" directory
allowed_expr+="|^META-INF/"
# * whatever under the "webapps" directory; for minicluster UIs
allowed_expr+="|^webapps/"
# * Hadoop's default configuration files, which have the form
# "_module_-default.xml"
allowed_expr+="|^[^-]*-default\.xml$"
# * Hadoop's versioning properties files, which have the form
# "_module_-version-info.properties"
allowed_expr+="|^[^-]*-version-info\.properties$"
# * Hadoop's application classloader properties file.
allowed_expr+="|^org\.apache\.hadoop\.application-classloader\.properties$"
# * Used by JavaSandboxLinuxContainerRuntime as a default, loaded
# from root, so can't relocate. :(
allowed_expr+="|^java\.policy$"
allowed_expr+=")"
# Walk every jar named in the colon separated list passed as "$1" and report
# each entry whose path is not matched by ${allowed_expr}. Exits with status 1
# when at least one jar has unexpected entries or could not be listed.
declare -i bad_artifacts=0
declare -a bad_contents
declare -a artifact_list
declare jar_listing
declare entry
# NUL-terminate the argument so 'read' consumes all of it, splitting on ':'.
IFS=: read -r -d '' -a artifact_list < <(printf '%s\0' "$1")
for artifact in "${artifact_list[@]}"; do
  # List the jar on its own first: when 'jar' fails (missing or corrupt file)
  # its empty output must not be mistaken for "nothing unexpected found".
  if ! jar_listing="$(jar tf "${artifact}")"; then
    echo "[ERROR] Could not list the contents of artifact: '${artifact}'"
    bad_artifacts=$((bad_artifacts + 1))
    continue
  fi
  # Collect offending entries one line at a time so that paths containing
  # whitespace or glob characters are kept intact instead of being word-split
  # and expanded against the working directory.
  bad_contents=()
  while IFS= read -r entry; do
    if [ -n "${entry}" ]; then
      bad_contents+=("${entry}")
    fi
  done < <(printf '%s\n' "${jar_listing}" | grep -v -E "${allowed_expr}")
  if [ ${#bad_contents[@]} -gt 0 ]; then
    echo "[ERROR] Found artifact with unexpected contents: '${artifact}'"
    echo " Please check the following and either correct the build or update"
    echo " the allowed list with reasoning."
    echo ""
    for bad_line in "${bad_contents[@]}"; do
      echo " ${bad_line}"
    done
    bad_artifacts=$((bad_artifacts + 1))
  else
    echo "[INFO] Artifact looks correct: '$(basename "${artifact}")'"
  fi
done
if [ "${bad_artifacts}" -gt 0 ]; then
  exit 1
fi

View File

@ -348,11 +348,6 @@
<artifactId>jersey-servlet</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.eclipse.jdt</groupId>
<artifactId>core</artifactId>
<optional>true</optional>
</dependency>
<!-- skip org.apache.avro:avro-ipc because it doesn't look like hadoop-common actually uses it -->
<dependency>
<groupId>net.sf.kosmosfs</groupId>
@ -595,6 +590,7 @@
<exclude>org.slf4j:slf4j-api</exclude>
<exclude>commons-logging:commons-logging</exclude>
<exclude>junit:junit</exclude>
<exclude>com.google.code.findbugs:jsr305</exclude>
<!-- Keep optional runtime deps out of the shading -->
<exclude>org.apache.hadoop:hadoop-yarn-server-timelineservice</exclude>
<exclude>log4j:log4j</exclude>
@ -656,6 +652,41 @@
<exclude>org/hamcrest/*.class</exclude>
</excludes>
</filter>
<!-- skip grizzly internals we don't need to run. -->
<filter>
<artifact>org.glassfish.grizzly:grizzly-http-servlet</artifact>
<excludes>
<exclude>catalog.cat</exclude>
<exclude>javaee_5.xsd</exclude>
<exclude>javaee_6.xsd</exclude>
<exclude>javaee_web_services_client_1_2.xsd</exclude>
<exclude>javaee_web_services_client_1_3.xsd</exclude>
<exclude>jsp_2_1.xsd</exclude>
<exclude>jsp_2_2.xsd</exclude>
<exclude>web-app_2_5.xsd</exclude>
<exclude>web-app_3_0.xsd</exclude>
<exclude>web-common_3_0.xsd</exclude>
<exclude>xml.xsd</exclude>
</excludes>
</filter>
<filter>
<!-- skip jetty license info already incorporated into LICENSE/NOTICE -->
<artifact>org.eclipse.jetty:*</artifact>
<excludes>
<exclude>about.html</exclude>
</excludes>
</filter>
<filter>
<artifact>org.apache.hadoop:*</artifact>
<excludes>
<!-- No shipping log4j configs in a downstream facing library -->
<exclude>log4j.properties</exclude>
<exclude>container-log4j.properties</exclude>
<!-- keep optional runtime configuration out of the jar; downstream can provide -->
<exclude>capacity-scheduler.xml</exclude>
<exclude>krb5.conf</exclude>
</excludes>
</filter>
</filters>
<relocations>
<relocation>
@ -738,6 +769,7 @@
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<!-- JSRs that haven't made it to inclusion in J2SE -->
<relocation>
<pattern>javax/el/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.el.</shadedPattern>
@ -745,6 +777,13 @@
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>javax/cache/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.cache.</shadedPattern>
<excludes>
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>javax/inject/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.inject.</shadedPattern>
@ -759,6 +798,13 @@
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>javax/ws/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.ws.</shadedPattern>
<excludes>
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>jersey/</pattern>
<shadedPattern>${shaded.dependency.prefix}.jersey.</shadedPattern>
@ -776,6 +822,11 @@
<exclude>net/topology/**/*</exclude>
</excludes>
</relocation>
<!-- okio declares a top level package instead of nested -->
<relocation>
<pattern>okio/</pattern>
<shadedPattern>${shaded.dependency.prefix}.okio.</shadedPattern>
</relocation>
</relocations>
<transformers>
<!-- Needed until MSHADE-182 -->

View File

@ -94,6 +94,11 @@
<artifactId>commons-logging</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.google.code.findbugs</groupId>
<artifactId>jsr305</artifactId>
<scope>runtime</scope>
</dependency>
<!-- Move log4j to optional, since it is needed for some pieces folks might not use:
* one of the three custom log4j appenders we have
-->
@ -149,6 +154,9 @@
<exclude>commons-logging:commons-logging</exclude>
<!-- Leave log4j unshaded so downstream users can configure logging. -->
<exclude>log4j:log4j</exclude>
<!-- Leave javax APIs that are stable -->
<!-- the jdk ships part of the javax.annotation namespace, so if we want to relocate this we'll have to carve it out by class :( -->
<exclude>com.google.code.findbugs:jsr305</exclude>
</excludes>
</artifactSet>
<filters>
@ -181,6 +189,28 @@
<exclude>META-INF/services/javax.*</exclude>
</excludes>
</filter>
<filter>
<!-- skip french localization -->
<artifact>org.apache.commons:commons-math3</artifact>
<excludes>
<exclude>assets/org/apache/commons/math3/**/*</exclude>
</excludes>
</filter>
<filter>
<!-- skip jetty license info already incorporated into LICENSE/NOTICE -->
<artifact>org.eclipse.jetty:*</artifact>
<excludes>
<exclude>about.html</exclude>
</excludes>
</filter>
<filter>
<!-- skip docs on formats used in kerby -->
<artifact>org.apache.kerby:kerb-util</artifact>
<excludes>
<exclude>keytab.txt</exclude>
<exclude>ccache.txt</exclude>
</excludes>
</filter>
</filters>
<relocations>
<relocation>
@ -245,6 +275,7 @@
<exclude>io/serializations</exclude>
</excludes>
</relocation>
<!-- JSRs that haven't made it to inclusion in J2SE -->
<relocation>
<pattern>javax/el/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.el.</shadedPattern>
@ -252,6 +283,13 @@
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>javax/cache/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.cache.</shadedPattern>
<excludes>
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>javax/servlet/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.servlet.</shadedPattern>
@ -259,6 +297,13 @@
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>javax/ws/</pattern>
<shadedPattern>${shaded.dependency.prefix}.javax.ws.</shadedPattern>
<excludes>
<exclude>**/pom.xml</exclude>
</excludes>
</relocation>
<relocation>
<pattern>net/</pattern>
<shadedPattern>${shaded.dependency.prefix}.net.</shadedPattern>
@ -269,6 +314,11 @@
<exclude>net/topology/**/*</exclude>
</excludes>
</relocation>
<!-- okio declares a top level package instead of nested -->
<relocation>
<pattern>okio/</pattern>
<shadedPattern>${shaded.dependency.prefix}.okio.</shadedPattern>
</relocation>
<!-- probably not. -->
<!--
<relocation>

View File

@ -35,6 +35,13 @@
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-server-common</artifactId>
<!-- Not needed for client side -->
<exclusions>
<exclusion>
<groupId>com.microsoft.sqlserver</groupId>
<artifactId>mssql-jdbc</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>