YARN-9268. General improvements in FpgaDevice. Contributed by Peter Bacsko.
This commit is contained in:
parent
8739693514
commit
eeda6891e4
@ -21,6 +21,7 @@
|
||||
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -51,7 +52,7 @@ public class FpgaResourceAllocator {
|
||||
//key is resource type of FPGA, vendor plugin supported ID
|
||||
private LinkedHashMap<String, List<FpgaDevice>> availableFpga = new LinkedHashMap<>();
|
||||
|
||||
//key is requetor, aka. container ID
|
||||
//key is requestor, aka. container ID
|
||||
private LinkedHashMap<String, List<FpgaDevice>> usedFpgaByRequestor = new LinkedHashMap<>();
|
||||
|
||||
private Context nmContext;
|
||||
@ -133,35 +134,33 @@ public String toString() {
|
||||
}
|
||||
}
|
||||
|
||||
public static class FpgaDevice implements Comparable<FpgaDevice>, Serializable {
|
||||
/** A class that represents an FPGA card. */
|
||||
public static class FpgaDevice implements Serializable {
|
||||
private static final long serialVersionUID = -4678487141824092751L;
|
||||
private final String type;
|
||||
private final int major;
|
||||
private final int minor;
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private String type;
|
||||
private Integer major;
|
||||
private Integer minor;
|
||||
// IP file identifier. matrix multiplication for instance
|
||||
private String IPID;
|
||||
// SHA-256 hash of the uploaded aocx file
|
||||
private String aocxHash;
|
||||
// the device name under /dev
|
||||
private String devName;
|
||||
// the alias device name. Intel uses acl numbers acl0 to acl31
|
||||
private String aliasDevName;
|
||||
// lspci output's bus number: 02:00.00 (bus:slot.func)
|
||||
private String busNum;
|
||||
private String temperature;
|
||||
private String cardPowerUsage;
|
||||
private final String aliasDevName;
|
||||
|
||||
// IP file identifier. matrix multiplication for instance (mutable)
|
||||
private String IPID;
|
||||
// SHA-256 hash of the uploaded aocx file (mutable)
|
||||
private String aocxHash;
|
||||
|
||||
// cached hash value
|
||||
private Integer hashCode;
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public Integer getMajor() {
|
||||
public int getMajor() {
|
||||
return major;
|
||||
}
|
||||
|
||||
public Integer getMinor() {
|
||||
public int getMinor() {
|
||||
return minor;
|
||||
}
|
||||
|
||||
@ -181,57 +180,16 @@ public void setIPID(String IPID) {
|
||||
this.IPID = IPID;
|
||||
}
|
||||
|
||||
public String getDevName() {
|
||||
return devName;
|
||||
}
|
||||
|
||||
public void setDevName(String devName) {
|
||||
this.devName = devName;
|
||||
}
|
||||
|
||||
public String getAliasDevName() {
|
||||
return aliasDevName;
|
||||
}
|
||||
|
||||
public void setAliasDevName(String aliasDevName) {
|
||||
this.aliasDevName = aliasDevName;
|
||||
}
|
||||
|
||||
public String getBusNum() {
|
||||
return busNum;
|
||||
}
|
||||
|
||||
public void setBusNum(String busNum) {
|
||||
this.busNum = busNum;
|
||||
}
|
||||
|
||||
public String getTemperature() {
|
||||
return temperature;
|
||||
}
|
||||
|
||||
public String getCardPowerUsage() {
|
||||
return cardPowerUsage;
|
||||
}
|
||||
|
||||
public FpgaDevice(String type, Integer major, Integer minor, String IPID) {
|
||||
this.type = type;
|
||||
public FpgaDevice(String type, int major, int minor, String aliasDevName) {
|
||||
this.type = Preconditions.checkNotNull(type, "type must not be null");
|
||||
this.major = major;
|
||||
this.minor = minor;
|
||||
this.IPID = IPID;
|
||||
}
|
||||
|
||||
public FpgaDevice(String type, Integer major,
|
||||
Integer minor, String IPID, String devName,
|
||||
String aliasDevName, String busNum, String temperature, String cardPowerUsage) {
|
||||
this.type = type;
|
||||
this.major = major;
|
||||
this.minor = minor;
|
||||
this.IPID = IPID;
|
||||
this.devName = devName;
|
||||
this.aliasDevName = aliasDevName;
|
||||
this.busNum = busNum;
|
||||
this.temperature = temperature;
|
||||
this.cardPowerUsage = cardPowerUsage;
|
||||
this.aliasDevName = Preconditions.checkNotNull(aliasDevName,
|
||||
"aliasDevName must not be null");
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -242,31 +200,48 @@ public boolean equals(Object obj) {
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (!(obj instanceof FpgaDevice)) {
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
FpgaDevice other = (FpgaDevice) obj;
|
||||
if (other.getType().equals(this.type) &&
|
||||
other.getMajor().equals(this.major) &&
|
||||
other.getMinor().equals(this.minor)) {
|
||||
return true;
|
||||
if (aliasDevName == null) {
|
||||
if (other.aliasDevName != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!aliasDevName.equals(other.aliasDevName)) {
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
if (major != other.major) {
|
||||
return false;
|
||||
}
|
||||
if (minor != other.minor) {
|
||||
return false;
|
||||
}
|
||||
if (type == null) {
|
||||
if (other.type != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!type.equals(other.type)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + ((type == null) ? 0 : type.hashCode());
|
||||
result = prime * result + ((major == null) ? 0 : major.hashCode());
|
||||
result = prime * result + ((minor == null) ? 0 : minor.hashCode());
|
||||
return result;
|
||||
}
|
||||
if (hashCode == null) {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
|
||||
@Override
|
||||
public int compareTo(FpgaDevice o) {
|
||||
return 0;
|
||||
result = prime * result + major;
|
||||
result = prime * result + type.hashCode();
|
||||
result = prime * result + minor;
|
||||
result = prime * result + aliasDevName.hashCode();
|
||||
|
||||
hashCode = result;
|
||||
}
|
||||
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -149,8 +149,8 @@ public static List<FpgaDevice> parseDiagnosticOutput(
|
||||
|
||||
devices.add(new FpgaDevice(fpgaType,
|
||||
Integer.parseInt(mmn[0]),
|
||||
Integer.parseInt(mmn[1]), null,
|
||||
fields[0], aliasName, fields[1], fields[2], fields[3]));
|
||||
Integer.parseInt(mmn[1]),
|
||||
aliasName));
|
||||
} else {
|
||||
LOG.warn("Failed to retrieve major/minor number for device");
|
||||
}
|
||||
|
@ -153,7 +153,7 @@ public List<FpgaResourceAllocator.FpgaDevice> discover()
|
||||
// Replace list with a filtered one
|
||||
list = list
|
||||
.stream()
|
||||
.filter(dev -> minors.contains(dev.getMinor().toString()))
|
||||
.filter(dev -> minors.contains(String.valueOf(dev.getMinor())))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
// if the count of user configured is still larger than actual
|
||||
|
@ -66,12 +66,7 @@ static List<FpgaDevice> getDevicesFromString(String type, String devices)
|
||||
fpgaDevices.add(new FpgaDevice(type,
|
||||
major,
|
||||
minor,
|
||||
null,
|
||||
null,
|
||||
devName,
|
||||
null,
|
||||
null,
|
||||
null));
|
||||
devName));
|
||||
} catch (NumberFormatException e) {
|
||||
throw new ResourceHandlerException(
|
||||
"Cannot parse major/minor number: " + deviceSpec);
|
||||
|
@ -112,7 +112,7 @@ public void setup() throws IOException {
|
||||
// Assumed devices parsed from output
|
||||
deviceList = new ArrayList<>();
|
||||
for (int i = 0; i < 5; i++) {
|
||||
deviceList.add(new FpgaDevice(vendorType, 247, i, null));
|
||||
deviceList.add(new FpgaDevice(vendorType, 247, i, "acl" + i));
|
||||
}
|
||||
String aocxPath = getTestParentFolder() + "/test.aocx";
|
||||
mockVendorPlugin = mockPlugin(vendorType, deviceList, aocxPath);
|
||||
@ -163,11 +163,11 @@ public void testBootstrap() throws ResourceHandlerException {
|
||||
for (String s : allowed.split(",")) {
|
||||
boolean check = false;
|
||||
for (FpgaDevice device : allowedDevices) {
|
||||
if (device.getMinor().toString().equals(s)) {
|
||||
if (String.valueOf(device.getMinor()).equals(s)) {
|
||||
check = true;
|
||||
}
|
||||
}
|
||||
Assert.assertTrue("Minor:" + s +"found", check);
|
||||
Assert.assertTrue("Minor:" + s +" found", check);
|
||||
}
|
||||
Assert.assertEquals(3,
|
||||
fpgaResourceHandler.getFpgaAllocator().getAvailableFpgaCount());
|
||||
@ -398,10 +398,10 @@ public void testStateStore()
|
||||
public void testReacquireContainer() throws ResourceHandlerException {
|
||||
Container c0 = mockContainer(0, 2, "GEMM");
|
||||
List<FpgaDevice> assigned = new ArrayList<>();
|
||||
assigned.add(new
|
||||
FpgaDevice(vendorType, 247, 0, null));
|
||||
assigned.add(new
|
||||
FpgaDevice(vendorType, 247, 1, null));
|
||||
assigned.add(new FpgaDevice(
|
||||
vendorType, 247, 0, "acl0"));
|
||||
assigned.add(new FpgaDevice(
|
||||
vendorType, 247, 1, "acl1"));
|
||||
// Mock we've stored the c0 states
|
||||
mockStateStoreForContainer(c0, assigned);
|
||||
// NM start
|
||||
@ -419,10 +419,10 @@ public void testReacquireContainer() throws ResourceHandlerException {
|
||||
getUsedFpga().get(getContainerId(0).toString());
|
||||
int count = 0;
|
||||
for (FpgaDevice device : used) {
|
||||
if (device.getMinor().equals(0)){
|
||||
if (device.getMinor() == 0){
|
||||
count++;
|
||||
}
|
||||
if (device.getMinor().equals(1)) {
|
||||
if (device.getMinor() == 1) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
@ -434,7 +434,7 @@ public void testReacquireContainer() throws ResourceHandlerException {
|
||||
.get(vendorType);
|
||||
count = 0;
|
||||
for (FpgaDevice device : available) {
|
||||
if (device.getMinor().equals(2)) {
|
||||
if (device.getMinor() == 2) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
@ -445,8 +445,8 @@ public void testReacquireContainer() throws ResourceHandlerException {
|
||||
// Case 2. Recover a not allowed device with minor number 5
|
||||
Container c1 = mockContainer(1, 1, "GEMM");
|
||||
assigned = new ArrayList<>();
|
||||
assigned.add(new
|
||||
FpgaDevice(vendorType, 247, 5, null));
|
||||
assigned.add(new FpgaDevice(
|
||||
vendorType, 247, 5, "acl0"));
|
||||
// Mock we've stored the c1 states
|
||||
mockStateStoreForContainer(c1, assigned);
|
||||
boolean flag = false;
|
||||
@ -464,8 +464,8 @@ public void testReacquireContainer() throws ResourceHandlerException {
|
||||
// Case 3. recover a device already used by another container
|
||||
Container c2 = mockContainer(2, 1, "GEMM");
|
||||
assigned = new ArrayList<>();
|
||||
assigned.add(new
|
||||
FpgaDevice(vendorType, 247, 1, null));
|
||||
assigned.add(new FpgaDevice(
|
||||
vendorType, 247, 1, "acl0"));
|
||||
// Mock we've stored the c2 states
|
||||
mockStateStoreForContainer(c2, assigned);
|
||||
flag = false;
|
||||
@ -483,8 +483,8 @@ public void testReacquireContainer() throws ResourceHandlerException {
|
||||
// Case 4. recover a normal container c3 with remaining minor device number 2
|
||||
Container c3 = mockContainer(3, 1, "GEMM");
|
||||
assigned = new ArrayList<>();
|
||||
assigned.add(new
|
||||
FpgaDevice(vendorType, 247, 2, null));
|
||||
assigned.add(new FpgaDevice(
|
||||
vendorType, 247, 2, "acl2"));
|
||||
// Mock we've stored the c3 states
|
||||
mockStateStoreForContainer(c3, assigned);
|
||||
fpgaResourceHandler.reacquireContainer(getContainerId(3));
|
||||
|
@ -83,31 +83,19 @@ public void testParsing() {
|
||||
|
||||
assertEquals(3, devices.size());
|
||||
assertEquals("IntelOpenCL", devices.get(0).getType());
|
||||
assertEquals("247", devices.get(0).getMajor().toString());
|
||||
assertEquals("0", devices.get(0).getMinor().toString());
|
||||
assertEquals(247, devices.get(0).getMajor());
|
||||
assertEquals(0, devices.get(0).getMinor());
|
||||
assertEquals("acl0", devices.get(0).getAliasDevName());
|
||||
assertEquals("aclnalla_pcie0", devices.get(0).getDevName());
|
||||
assertEquals("02:00.00", devices.get(0).getBusNum());
|
||||
assertEquals("53.1 degrees C", devices.get(0).getTemperature());
|
||||
assertEquals("31.7 Watts", devices.get(0).getCardPowerUsage());
|
||||
|
||||
assertEquals("IntelOpenCL", devices.get(1).getType());
|
||||
assertEquals("247", devices.get(1).getMajor().toString());
|
||||
assertEquals("1", devices.get(1).getMinor().toString());
|
||||
assertEquals(247, devices.get(1).getMajor());
|
||||
assertEquals(1, devices.get(1).getMinor());
|
||||
assertEquals("acl1", devices.get(1).getAliasDevName());
|
||||
assertEquals("aclnalla_pcie1", devices.get(1).getDevName());
|
||||
assertEquals("03:00.00", devices.get(1).getBusNum());
|
||||
assertEquals("43.1 degrees C", devices.get(1).getTemperature());
|
||||
assertEquals("11.7 Watts", devices.get(1).getCardPowerUsage());
|
||||
|
||||
assertEquals("IntelOpenCL", devices.get(2).getType());
|
||||
assertEquals("246", devices.get(2).getMajor().toString());
|
||||
assertEquals("0", devices.get(2).getMinor().toString());
|
||||
assertEquals(246, devices.get(2).getMajor());
|
||||
assertEquals(0, devices.get(2).getMinor());
|
||||
assertEquals("acl2", devices.get(2).getAliasDevName());
|
||||
assertEquals("acla10_ref0", devices.get(2).getDevName());
|
||||
assertEquals("09:00.00", devices.get(2).getBusNum());
|
||||
assertEquals("50.5781 degrees C", devices.get(2).getTemperature());
|
||||
assertEquals("", devices.get(2).getCardPowerUsage());
|
||||
|
||||
// Case 2. check alias map
|
||||
assertEquals("acl0", devices.get(0).getAliasDevName());
|
||||
|
@ -19,7 +19,6 @@
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga;
|
||||
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.mockito.ArgumentMatchers.anyInt;
|
||||
import static org.mockito.ArgumentMatchers.anyString;
|
||||
@ -175,12 +174,12 @@ public void testDiscoveryWhenAvailableDevicesDefined()
|
||||
FpgaDevice device1 = devices.get(1);
|
||||
|
||||
assertEquals("Device id", "acl0", device0.getAliasDevName());
|
||||
assertEquals("Minor number", new Integer(0), device0.getMinor());
|
||||
assertEquals("Major", new Integer(243), device0.getMajor());
|
||||
assertEquals("Minor number", 0, device0.getMinor());
|
||||
assertEquals("Major", 243, device0.getMajor());
|
||||
|
||||
assertEquals("Device id", "acl1", device1.getAliasDevName());
|
||||
assertEquals("Minor number", new Integer(1), device1.getMinor());
|
||||
assertEquals("Major", new Integer(244), device1.getMajor());
|
||||
assertEquals("Minor number", 1, device1.getMinor());
|
||||
assertEquals("Major", 244, device1.getMajor());
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -245,12 +244,12 @@ public void testDiscoveryWhenExternalScriptDefined()
|
||||
FpgaDevice device1 = devices.get(1);
|
||||
|
||||
assertEquals("Device id", "acl0", device0.getAliasDevName());
|
||||
assertEquals("Minor number", new Integer(0), device0.getMinor());
|
||||
assertEquals("Major", new Integer(243), device0.getMajor());
|
||||
assertEquals("Minor number", 0, device0.getMinor());
|
||||
assertEquals("Major", 243, device0.getMajor());
|
||||
|
||||
assertEquals("Device id", "acl1", device1.getAliasDevName());
|
||||
assertEquals("Minor number", new Integer(1), device1.getMinor());
|
||||
assertEquals("Major", new Integer(244), device1.getMajor());
|
||||
assertEquals("Minor number", 1, device1.getMinor());
|
||||
assertEquals("Major", 244, device1.getMajor());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
Loading…
Reference in New Issue
Block a user