#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Given a jenkins test job, this script examines all runs of the job done
# within a specified period of time (number of days prior to the execution
# time of this script), and reports all failed tests.
#
# The output of this script includes a section for each run that has failed
# tests, with each failed test name listed.
#
# More importantly, at the end, it outputs a summary section that lists all
# failed tests within all examined runs, indicates in how many runs each test
# failed, and sorts the failed tests by that number of runs.
#
# This way, when we see failed tests in a PreCommit build, we can quickly tell
# whether a failed test is a new failure, or whether it failed before and how
# often, so as to get an idea of whether it may just be a flaky test.
#
# Of course, to be 100% sure about the reason of a test failure, a closer look
# at the failed test for the specific run is necessary.
#
# Provenance: introduced by HADOOP-11045, "Introducing a tool to detect flaky
# tests of hadoop jenkins testing job" (contributed by Yongjun Zhang and Todd
# Lipcon), as dev-support/determine-flaky-tests-hadoop.py. The same patch also
# added the matching entry to hadoop-common-project/hadoop-common/CHANGES.txt
# under Release 2.7.0.
#
import sys
import platform

sysversion = sys.hexversion
onward30 = False
if sysversion < 0x020600F0:
    sys.exit("Minimum supported python version is 2.6, the current version is "
             + "Python " + platform.python_version())

if sysversion == 0x030000F0:
    sys.exit("There is a known bug with Python " + platform.python_version()
             + ", please try a different version")

# The URL-fetching module differs between Python 2 and Python 3; remember
# which one was imported so load_url_data() can pick the matching code path.
if sysversion < 0x03000000:
    import urllib2
else:
    onward30 = True
    import urllib.request

from contextlib import closing
import datetime
import json as simplejson
import logging
from optparse import OptionParser
import time

# Configuration
DEFAULT_JENKINS_URL = "https://builds.apache.org"
DEFAULT_JOB_NAME = "Hadoop-Common-trunk"
DEFAULT_NUM_PREVIOUS_DAYS = 14

SECONDS_PER_DAY = 86400

# Total number of runs to examine. Written by find_flaky_tests() and read by
# main() when it prints the summary header.
numRunsToExamine = 0


def parse_args(argv=None):
    """Parse the command-line options.

    argv -- optional list of arguments (without the program name); when None,
            optparse falls back to sys.argv[1:].

    Returns the optparse options object with the attributes jenkins_url,
    job_name and num_prev_days. Exits through parser.error() if positional
    arguments are supplied.
    """
    parser = OptionParser()
    parser.add_option("-J", "--jenkins-url", type="string",
                      dest="jenkins_url", help="Jenkins URL",
                      default=DEFAULT_JENKINS_URL)
    parser.add_option("-j", "--job-name", type="string",
                      dest="job_name", help="Job name to look at",
                      default=DEFAULT_JOB_NAME)
    parser.add_option("-n", "--num-days", type="int",
                      dest="num_prev_days", help="Number of days to examine",
                      default=DEFAULT_NUM_PREVIOUS_DAYS)

    (options, args) = parser.parse_args(argv)
    if args:
        parser.error("unexpected arguments: " + repr(args))
    return options


def load_url_data(url):
    """Fetch `url` and return its body decoded as JSON.

    The response object is always closed, including when reading or parsing
    fails. Errors from opening the URL or parsing the JSON propagate to the
    caller.
    """
    if onward30:
        opener = urllib.request.urlopen
    else:
        opener = urllib2.urlopen

    with closing(opener(url)) as response:
        if onward30:
            # get_param() returns None when the Content-Type header carries no
            # charset; decoding with None would raise TypeError, so fall back
            # to UTF-8 (the default encoding of JSON).
            charset = response.info().get_param('charset') or 'utf-8'
            return simplejson.loads(response.read().decode(charset))
        return simplejson.load(response)


def list_builds(jenkins_url, job_name):
    """List all builds of the target project.

    Returns the 'builds' list of the job's JSON API response, restricted by
    the query to the url, result and timestamp fields of each build. A fetch
    failure is logged and then re-raised.
    """
    url = "%(jenkins)s/job/%(job_name)s/api/json?tree=builds[url,result,timestamp]" % dict(
        jenkins=jenkins_url,
        job_name=job_name)

    try:
        data = load_url_data(url)
    except Exception:
        logging.error("Could not fetch: %s" % url)
        raise
    return data['builds']


def find_failing_tests(testReportApiJson, jobConsoleOutput):
    """Find the names of any tests which failed in the given build.

    testReportApiJson -- URL of the build's testReport JSON API.
    jobConsoleOutput  -- console URL of the build; only used in log messages
                         that point the user at further details.

    Returns a set of "<className>.<name>" strings. The set is empty when the
    report cannot be loaded or lists no failed test.
    """
    ret = set()
    try:
        data = load_url_data(testReportApiJson)
    except Exception:
        # Best effort: a build without a readable report is skipped, not fatal.
        logging.error(" Could not open testReport, check "
                      + jobConsoleOutput + " for why it was reported failed")
        return ret

    # Use .get() throughout so that a report or case entry lacking one of the
    # optional keys is treated as "nothing reported" instead of aborting the
    # whole scan with a KeyError.
    for suite in data.get('suites') or []:
        for cs in suite.get('cases') or []:
            status = cs.get('status')
            errDetails = cs.get('errorDetails')
            if status in ('REGRESSION', 'FAILED') or errDetails is not None:
                ret.add(cs['className'] + "." + cs['name'])

    if not ret:
        logging.info(" No failed tests in testReport, check "
                     + jobConsoleOutput + " for why it was reported failed.")
    return ret


def find_flaky_tests(jenkins_url, job_name, num_prev_days):
    """Iterate runs of the specified job within num_prev_days and collect results.

    Returns a dict mapping each failed test name to the number of examined
    runs in which it failed. As a side effect, stores the number of runs that
    fall inside the time window in the module-level numRunsToExamine.
    """
    global numRunsToExamine
    all_failing = dict()

    # First list all builds
    builds = list_builds(jenkins_url, job_name)

    # Select only those in the last N days. The timestamps are divided by
    # 1000 before being compared with time.time(), which is in seconds.
    min_time = int(time.time()) - SECONDS_PER_DAY * num_prev_days
    builds = [b for b in builds if (int(b['timestamp']) / 1000) > min_time]

    # Filter out only those that failed
    failing_build_urls = [(b['url'], b['timestamp']) for b in builds
                          if b['result'] in ('UNSTABLE', 'FAILURE')]

    tnum = len(builds)
    num = len(failing_build_urls)
    numRunsToExamine = tnum
    suffix = ", as listed below:\n" if num > 0 else "."
    logging.info(" THERE ARE " + str(num) + " builds (out of " + str(tnum)
                 + ") that have failed tests in the past " + str(num_prev_days)
                 + " days" + suffix)

    for failed_build, timestamp_ms in failing_build_urls:
        # NOTE(review): plain concatenation presumes the build URL returned by
        # Jenkins ends with "/" -- confirm against the API output.
        jobConsoleOutput = failed_build + "Console"
        testReport = failed_build + "testReport"
        testReportApiJson = testReport + "/api/json"

        ts = float(timestamp_ms) / 1000.
        st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
        logging.info("===>%s" % str(testReport) + " (" + st + ")")
        for ftest in find_failing_tests(testReportApiJson, jobConsoleOutput):
            logging.info(" Failed test: %s" % ftest)
            all_failing[ftest] = all_failing.get(ftest, 0) + 1

    return all_failing


def _configure_logging():
    """Send INFO-and-above log records to stdout as bare messages."""
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    # Drop whatever handlers are installed so every record is printed exactly
    # once, by the stdout handler added below.
    for handler in list(root.handlers):
        root.removeHandler(handler)
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setLevel(logging.INFO)
    root.addHandler(stdout_handler)


def _rank_failures(all_failing):
    """Return (test name, run count) pairs, most frequent failures first.

    Ties are broken by test name so the report order is deterministic.
    """
    return sorted(all_failing.items(), key=lambda item: (-item[1], item[0]))


def main(argv=None):
    """Report recently failed tests of a Jenkins job, then a ranked summary.

    argv -- optional argument list forwarded to parse_args(); None means use
            the process command line.
    """
    _configure_logging()

    opts = parse_args(argv)
    logging.info("****Recently FAILED builds in url: " + opts.jenkins_url
                 + "/job/" + opts.job_name + "")

    all_failing = find_flaky_tests(opts.jenkins_url, opts.job_name,
                                   opts.num_prev_days)
    if not all_failing:
        # Nothing failed in the examined window, so there is no summary.
        return

    logging.info("\nAmong " + str(numRunsToExamine) + " runs examined, all failed "
                 + "tests <#failedRuns: testName>:")

    # print summary section: all failed tests sorted by how many times they failed
    for test_name, run_count in _rank_failures(all_failing):
        logging.info(" " + str(run_count) + ": " + test_name)


if __name__ == "__main__":
    main()