All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.tencent.angel.master.matrixmeta.AMMatrixMetaManager Maven / Gradle / Ivy

There is a newer version: 3.2.0
Show newest version
/*
 * Tencent is pleased to support the open source community by making Angel available.
 *
 * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 * https://opensource.org/licenses/Apache-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 */


package com.tencent.angel.master.matrixmeta;

import com.tencent.angel.PartitionKey;
import com.tencent.angel.conf.AngelConf;
import com.tencent.angel.conf.MatrixConf;
import com.tencent.angel.exception.InvalidParameterException;
import com.tencent.angel.master.app.AMContext;
import com.tencent.angel.ml.matrix.*;
import com.tencent.angel.model.output.format.ModelFilesConstent;
import com.tencent.angel.model.output.format.MatrixFilesMeta;
import com.tencent.angel.model.output.format.MatrixPartitionMeta;
import com.tencent.angel.protobuf.ProtobufUtil;
import com.tencent.angel.ps.ParameterServerId;
import com.tencent.angel.ps.ha.RecoverPartKey;
import com.tencent.angel.ps.server.data.PSLocation;
import com.tencent.angel.ps.storage.matrix.PartitionState;
import com.tencent.angel.ps.storage.partitioner.Partitioner;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * Matrix meta manager in the Angel application master. It holds the meta of all matrices and the
 * partitions that each parameter server holds.
 */
public class AMMatrixMetaManager {

  private static final Log LOG = LogFactory.getLog(AMMatrixMetaManager.class);
  private final AMContext context;

  /**
   * Matrix meta manager
   */
  private final MatrixMetaManager matrixMetaManager;

  /**
   * inverted index, psId--->Map( matrixId---->List), used for PS
   */
  private final Map> matrixPartitionsOnPS;

  /**
   * ps id to matrices on this ps map
   */
  private final Map> psIdToMatrixIdsMap;

  /**
   * matrix id to psId which has build partitions of this matrix map, use to add matrix
   */
  private final Map> matrixIdToPSSetMap;

  private final Map> psIdToRecoverPartsMap;

  /**
   * matrix id generator
   */
  private int maxMatrixId = 0;

  private final Lock readLock;
  private final Lock writeLock;

  /**
   * Create an empty matrix meta manager.
   *
   * @param context application master context
   */
  public AMMatrixMetaManager(AMContext context) {
    this.context = context;

    // Both locks are the two views of one reentrant read/write lock
    final ReadWriteLock lock = new ReentrantReadWriteLock();
    this.readLock = lock.readLock();
    this.writeLock = lock.writeLock();

    this.matrixMetaManager = new MatrixMetaManager();
    this.matrixPartitionsOnPS = new HashMap<>();
    this.psIdToMatrixIdsMap = new HashMap<>();
    this.matrixIdToPSSetMap = new HashMap<>();
    this.psIdToRecoverPartsMap = new ConcurrentHashMap<>();

    // Add one sync matrix
    // addSyncMatrix();
  }


  /**
   * Look up a matrix meta by matrix name.
   *
   * @param matrixName matrix name
   * @return the meta of the matrix, or null if no matrix has this name
   */
  public MatrixMeta getMatrix(String matrixName) {
    final MatrixMeta meta = matrixMetaManager.getMatrixMeta(matrixName);
    return meta;
  }

  /**
   * Look up a matrix meta by matrix id.
   *
   * @param matrixId matrix id
   * @return the meta of the matrix, as returned by the underlying matrix meta manager
   */
  public MatrixMeta getMatrix(int matrixId) {
    final MatrixMeta meta = matrixMetaManager.getMatrixMeta(matrixId);
    return meta;
  }

  /**
   * Get the matrices, and through their metas the partitions, that a parameter server holds.
   *
   * @param psId parameter server id
   * @return a new map from matrix id to the matrix meta placed on this parameter server; empty
   * if nothing is placed on it. Only the map is copied, the metas themselves are shared.
   */
  public final Map<Integer, MatrixMeta> getMatrixPartitions(ParameterServerId psId) {
    // Acquire before the try block so the finally clause never unlocks a lock that is not held
    readLock.lock();
    try {
      Map<Integer, MatrixMeta> metaInPS = matrixPartitionsOnPS.get(psId);
      if (metaInPS == null) {
        return new HashMap<>();
      }
      return new HashMap<>(metaInPS);
    } finally {
      readLock.unlock();
    }
  }

  /**
   * Create matrices one by one, in list order. Creation stops at the first matrix that fails;
   * matrices created before the failure are kept.
   *
   * @param matrixContexts contexts of the matrices to create
   * @throws Exception if creating any of the matrices fails
   */
  public void createMatrices(List<MatrixContext> matrixContexts) throws Exception {
    for (MatrixContext matrixContext : matrixContexts) {
      createMatrix(matrixContext);
    }
  }

  /**
   * Create a new matrix: reject duplicate names, build its meta, register the meta and index
   * its partitions by parameter server.
   *
   * @param matrixContext matrix context
   * @throws InvalidParameterException if a matrix with the same name already exists
   * @throws Exception if building the matrix meta fails
   */
  public void createMatrix(MatrixContext matrixContext) throws Exception {
    // Matrix names must be unique
    final String matrixName = matrixContext.getName();
    if (matrixMetaManager.exists(matrixName)) {
      final String message =
          "build matrix failed. matrix name " + matrixName + " has exist, you must choose a new one";
      LOG.error(message);
      throw new InvalidParameterException(message);
    }

    final MatrixMeta newMeta = initMatrixMeta(matrixContext);
    LOG.debug("after init matrix " + newMeta);
    matrixMetaManager.addMatrix(newMeta);

    // The per-PS indexes are guarded by the write lock
    writeLock.lock();
    try {
      buildPSMatrixMeta(newMeta);
    } finally {
      writeLock.unlock();
    }
  }

  /**
   * Instantiate and initialize the partitioner configured in the matrix context.
   *
   * @param matrixContext matrix context that names the partitioner class
   * @param conf configuration passed to the partitioner
   * @return the initialized partitioner
   * @throws NoSuchMethodException if the partitioner class has no public no-arg constructor
   * @throws InstantiationException if the partitioner class cannot be instantiated
   * @throws IllegalAccessException if the constructor is not accessible
   * @throws InvocationTargetException if the constructor itself throws
   */
  private Partitioner initPartitioner(MatrixContext matrixContext, Configuration conf)
      throws IllegalAccessException, InvocationTargetException, InstantiationException,
      NoSuchMethodException {
    // NOTE(review): assumes getPartitionerClass() is declared as Class<? extends Partitioner>
    Class<? extends Partitioner> partitionerClass = matrixContext.getPartitionerClass();
    Constructor<? extends Partitioner> constructor = partitionerClass.getConstructor();
    constructor.setAccessible(true);
    Partitioner partitioner = constructor.newInstance();
    partitioner.init(matrixContext, conf);
    return partitioner;
  }

  /**
   * Build the meta of a new matrix: allocate its id, determine its partitions and assign a
   * master parameter server (plus replication slaves, if enabled) to every partition.
   *
   * <p>Partitions come from, in priority order: the meta file under the matrix load path, the
   * partitions listed explicitly in the matrix context, or the partitioner.
   *
   * @param matrixContext matrix context; its matrix id is set by this method
   * @return the matrix meta
   * @throws IOException if the partition information cannot be loaded from the load path
   */
  private MatrixMeta initMatrixMeta(MatrixContext matrixContext)
      throws InvocationTargetException, NoSuchMethodException, InstantiationException,
      IllegalAccessException, IOException {

    // maxMatrixId is shared mutable state, so the id is allocated under the write lock
    writeLock.lock();
    try {
      matrixContext.setMatrixId(maxMatrixId++);
    } finally {
      writeLock.unlock();
    }

    String loadPath = matrixContext.getAttributes().get(MatrixConf.MATRIX_LOAD_PATH);
    Partitioner partitioner = initPartitioner(matrixContext, context.getConf());

    List<PartitionMeta> partitions;
    if (loadPath != null) {
      partitions = loadPartitionInfoFromHDFS(loadPath, matrixContext, context.getConf());
    } else {
      if (matrixContext.getParts() != null && !matrixContext.getParts().isEmpty()) {
        List<PartContext> partKeys = matrixContext.getParts();
        partitions = new ArrayList<>(partKeys.size());
        int partId = 0;
        int matrixId = matrixContext.getMatrixId();
        for (PartContext part : partKeys) {
          partitions.add(new PartitionMeta(matrixId, partId++, part.getStartRow(), part.getEndRow(),
              part.getStartCol(), part.getEndCol(), part.getIndexNum()));
        }
      } else {
        partitions = partitioner.getPartitions();
      }
    }

    assignPSForPartitions(partitioner, partitions);
    assignReplicationSlaves(partitions);

    Map<Integer, PartitionMeta> partIdToMetaMap = new HashMap<>(partitions.size());
    for (PartitionMeta partition : partitions) {
      partIdToMetaMap.put(partition.getPartId(), partition);
    }
    return new MatrixMeta(matrixContext, partIdToMetaMap);
  }

  /**
   * Load the partitions of a matrix from the matrix meta file stored under the load path.
   *
   * @param path the load path; the meta file is looked up in the sub-directory named after the
   * matrix
   * @param matrixContext matrix context; its name and its already assigned matrix id are used
   * @param conf the conf
   * @return matrix partitions
   * @throws IOException if the meta file does not exist or cannot be read
   */
  private List<PartitionMeta> loadPartitionInfoFromHDFS(String path, MatrixContext matrixContext,
      Configuration conf) throws IOException {
    Path metaFilePath =
        new Path(new Path(path, matrixContext.getName()), ModelFilesConstent.modelMetaFileName);
    MatrixFilesMeta meta = new MatrixFilesMeta();

    FileSystem fs = metaFilePath.getFileSystem(conf);
    LOG.info("Load matrix meta for matrix " + matrixContext.getName());

    if (!fs.exists(metaFilePath)) {
      throw new IOException("matrix meta file does not exist ");
    }

    // try-with-resources closes the stream without letting a failing close() hide a read failure
    try (FSDataInputStream input = fs.open(metaFilePath)) {
      meta.read(input);
    } catch (Throwable e) {
      // Kept broad on purpose: any failure while reading is reported to callers as IOException
      throw new IOException("Read meta failed ", e);
    }

    List<PartitionMeta> matrixPartitions = new ArrayList<>();
    Map<Integer, MatrixPartitionMeta> partMetas = meta.getPartMetas();

    // Reuse the id the caller already stored in the matrix context. Allocating another id here
    // would give the partitions a different matrix id than the matrix they belong to.
    int matrixId = matrixContext.getMatrixId();

    for (Map.Entry<Integer, MatrixPartitionMeta> partMetaEntry : partMetas.entrySet()) {
      MatrixPartitionMeta partMeta = partMetaEntry.getValue();
      matrixPartitions.add(
          new PartitionMeta(matrixId, partMetaEntry.getKey(), partMeta.getStartRow(),
              partMeta.getEndRow(), partMeta.getStartCol(), partMeta.getEndCol()));
    }
    return matrixPartitions;
  }

  /**
   * Assign a master parameter server to every partition, as chosen by the partitioner.
   *
   * @param partitioner partitioner that maps a partition id to a parameter server index
   * @param partitions partitions to assign; updated in place
   */
  private void assignPSForPartitions(Partitioner partitioner, List<PartitionMeta> partitions) {
    for (PartitionMeta partition : partitions) {
      int psIndex = partitioner.assignPartToServer(partition.getPartId());
      ParameterServerId psId = new ParameterServerId(psIndex);
      partition.addReplicationPS(psId);
      partition.makePsToMaster(psId);
    }
  }

  /**
   * Assign replication slaves to every partition. Does nothing when the configured replication
   * number is 1 or less.
   *
   * @param partitions partitions that already have a master parameter server; updated in place
   */
  private void assignReplicationSlaves(List<PartitionMeta> partitions) {
    int replicationNum = context.getConf().getInt(AngelConf.ANGEL_PS_HA_REPLICATION_NUMBER,
        AngelConf.DEFAULT_ANGEL_PS_HA_REPLICATION_NUMBER);
    if (replicationNum <= 1) {
      return;
    }

    int psNum =
        context.getConf().getInt(AngelConf.ANGEL_PS_NUMBER, AngelConf.DEFAULT_ANGEL_PS_NUMBER);
    for (PartitionMeta partition : partitions) {
      // One of the replicas is the master, the rest are slaves
      assignReplicationSlaves(partition, replicationNum - 1, psNum);
    }
  }

  /**
   * Assign slaves to one partition: the parameter servers that follow the master in index
   * order, wrapping around at psNum.
   *
   * @param partition partition that already has a master parameter server
   * @param slaveNum requested number of slaves
   * @param psNum total number of parameter servers
   */
  private void assignReplicationSlaves(PartitionMeta partition, int slaveNum, int psNum) {
    int psIndex = partition.getMasterPs().getIndex();
    // There are only psNum - 1 servers besides the master. Without the cap the wrap-around would
    // pick the master itself as a slave, and psNum == 0 would divide by zero.
    int usableSlaveNum = Math.min(slaveNum, psNum - 1);
    for (int i = 0; i < usableSlaveNum; i++) {
      partition.addReplicationPS(new ParameterServerId((psIndex + i + 1) % psNum));
    }
  }

  /**
   * Dispatch matrix partitions to parameter servers: for every partition and every parameter
   * server in its server list, record the partition in that server's matrix meta and record the
   * server as a holder of the matrix. Visible callers hold the write lock.
   *
   * @param matrixMeta meta of the matrix to dispatch
   */
  private void buildPSMatrixMeta(MatrixMeta matrixMeta) {
    Map<Integer, PartitionMeta> partMetas = matrixMeta.getPartitionMetas();
    int matrixId = matrixMeta.getId();
    Set<ParameterServerId> psIdSet = matrixIdToPSSetMap.get(matrixId);
    if (psIdSet == null) {
      psIdSet = new HashSet<>();
      matrixIdToPSSetMap.put(matrixId, psIdSet);
    }

    for (Entry<Integer, PartitionMeta> partEntry : partMetas.entrySet()) {
      PartitionMeta partMeta = partEntry.getValue();
      List<ParameterServerId> psList = partMeta.getPss();
      for (ParameterServerId psId : psList) {
        Map<Integer, MatrixMeta> psMatrixIdToMetaMap = matrixPartitionsOnPS.get(psId);
        if (psMatrixIdToMetaMap == null) {
          psMatrixIdToMetaMap = new HashMap<>();
          matrixPartitionsOnPS.put(psId, psMatrixIdToMetaMap);
        }

        MatrixMeta psMatrixMeta = psMatrixIdToMetaMap.get(matrixId);
        if (psMatrixMeta == null) {
          psMatrixMeta = new MatrixMeta(matrixMeta.getMatrixContext());
          psMatrixIdToMetaMap.put(matrixId, psMatrixMeta);
        }

        // Every server gets its own PartitionMeta with its own copy of the server list
        psMatrixMeta.addPartitionMeta(partEntry.getKey(),
            new PartitionMeta(partMeta.getPartitionKey(), new ArrayList<>(psList)));

        psIdSet.add(psId);
      }
    }
  }

  /**
   * Raise the matrix id generator to the given value if it is currently lower.
   *
   * @param id candidate value for the generator
   */
  private void updateMaxMatrixId(int id) {
    maxMatrixId = Math.max(maxMatrixId, id);
    LOG.debug("update maxMatrixId  to " + maxMatrixId);
  }

  /**
   * Compare the matrix meta on the master with the matrices a parameter server reports, to find
   * the matrices this parameter server needs to create and to release.
   *
   * @param matrixReports parameter server matrix reports, including the ids of the matrices this
   * parameter server holds
   * @param needCreateMatrixes output: metas of the matrices this parameter server needs to build
   * @param needReleaseMatrixes output: ids of the matrices this parameter server needs to remove
   * @param needRecoverParts output: partitions this parameter server needs to recover
   * @param psId parameter server id
   */
  public void syncMatrixInfos(List<MatrixReport> matrixReports,
      List<MatrixMeta> needCreateMatrixes, List<Integer> needReleaseMatrixes,
      List<RecoverPartKey> needRecoverParts, ParameterServerId psId) {

    // Collect the matrix ids in the parameter server report
    IntOpenHashSet matrixInPS = new IntOpenHashSet();
    for (MatrixReport matrixReport : matrixReports) {
      matrixInPS.add(matrixReport.matrixId);
    }

    handleMatrixReports(psId, matrixReports);

    // Recover requests queued for this PS are handed out once and then forgotten
    Set<RecoverPartKey> parts = getAndRemoveNeedRecoverParts(psId);
    if (parts != null) {
      needRecoverParts.addAll(parts);
    }

    // Find the matrices the parameter server needs to create and release
    getPSNeedUpdateMatrix(matrixInPS, needCreateMatrixes, needReleaseMatrixes, psId);
    psMatricesUpdate(psId, matrixReports);
  }

  /**
   * Process every matrix report of a parameter server.
   *
   * @param psId id of the reporting parameter server
   * @param matrixReports matrix reports of that parameter server
   */
  private void handleMatrixReports(ParameterServerId psId, List<MatrixReport> matrixReports) {
    for (MatrixReport matrixReport : matrixReports) {
      handleMatrixReport(psId, matrixReport);
    }
  }

  /**
   * Process every partition report contained in one matrix report.
   *
   * @param psId id of the reporting parameter server
   * @param matrixReport report for one matrix
   */
  private void handleMatrixReport(ParameterServerId psId, MatrixReport matrixReport) {
    final int reportedMatrixId = matrixReport.matrixId;
    for (PartReport partReport : matrixReport.partReports) {
      handlePartReport(psId, reportedMatrixId, partReport);
    }
  }

  /**
   * Process the report of one partition from a parameter server that is not the current master
   * of that partition. The reporting server is added to the partition's server list. Then:
   * <ul>
   * <li>if the partition is still initializing there, a recover request is queued for the
   * current master;</li>
   * <li>if the partition is readable and writable there, and the reporting server is the master
   * recorded in the dispatch index, it is made the master again.</li>
   * </ul>
   * Reports for unknown matrices or partitions are ignored.
   *
   * @param psId id of the reporting parameter server
   * @param matrixId id of the reported matrix
   * @param partReport report for one partition
   */
  private void handlePartReport(ParameterServerId psId, int matrixId, PartReport partReport) {
    ParameterServerId master = matrixMetaManager.getMasterPs(matrixId, partReport.partId);
    if (psId.equals(master)) {
      return;
    }

    MatrixMeta matrixMeta = matrixMetaManager.getMatrixMeta(matrixId);
    if (matrixMeta == null) {
      return;
    }

    PartitionMeta partMeta = matrixMeta.getPartitionMeta(partReport.partId);
    if (partMeta == null) {
      LOG.warn("ps " + psId + " reports unknown partition " + partReport.partId + " of matrix "
          + matrixId);
      return;
    }
    partMeta.addReplicationPS(psId);

    if (partReport.state == PartitionState.INITIALIZING) {
      if (master == null) {
        // The recover queue is a ConcurrentHashMap, which rejects null keys
        LOG.warn("partition " + partReport.partId + " of matrix " + matrixId
            + " has no master ps now, can not recover it to ps " + psId);
        return;
      }
      addNeedRecoverPart(master, new RecoverPartKey(new PartitionKey(matrixId, partReport.partId),
          new PSLocation(psId, context.getLocationManager().getPsLocation(psId))));
    } else if (partReport.state == PartitionState.READ_AND_WRITE) {
      ParameterServerId originalMaster = null;
      // Hold the read lock only for the lookup: addNeedRecoverPart above takes the write lock,
      // and a read lock cannot be upgraded
      readLock.lock();
      try {
        Map<Integer, MatrixMeta> matricesOnPS = matrixPartitionsOnPS.get(psId);
        MatrixMeta metaOnPS = (matricesOnPS == null) ? null : matricesOnPS.get(matrixId);
        PartitionMeta partMetaOnPS =
            (metaOnPS == null) ? null : metaOnPS.getPartitionMeta(partReport.partId);
        if (partMetaOnPS != null) {
          originalMaster = partMetaOnPS.getMasterPs();
        }
      } finally {
        readLock.unlock();
      }

      if (psId.equals(originalMaster)) {
        partMeta.makePsToMaster(psId);
      }
    }
  }

  /**
   * Queue a recover request for a master parameter server.
   *
   * @param master parameter server that will receive the request
   * @param needRecoverPart the partition to recover and the location of the PS that needs it
   */
  private void addNeedRecoverPart(ParameterServerId master, RecoverPartKey needRecoverPart) {
    writeLock.lock();
    try {
      Set<RecoverPartKey> needRecoverParts = psIdToRecoverPartsMap.get(master);
      if (needRecoverParts == null) {
        needRecoverParts = new HashSet<>();
        psIdToRecoverPartsMap.put(master, needRecoverParts);
      }
      needRecoverParts.add(needRecoverPart);
    } finally {
      writeLock.unlock();
    }
  }

  /**
   * Take the recover requests queued for a parameter server. The requests are removed from the
   * queue, so each request is returned once.
   *
   * @param master parameter server id
   * @return the queued recover requests, or null if none are queued
   */
  public Set<RecoverPartKey> getAndRemoveNeedRecoverParts(ParameterServerId master) {
    writeLock.lock();
    try {
      return psIdToRecoverPartsMap.remove(master);
    } finally {
      writeLock.unlock();
    }
  }

  /**
   * Work out which matrices a parameter server has to release and which it has to create.
   *
   * @param matrixIdInPSSet ids of the matrices the parameter server reports
   * @param needCreateMatrixes output: metas of matrices placed on this PS that it did not report
   * @param needReleaseMatrixes output: ids of reported matrices that are not placed on this PS
   * @param psId parameter server id
   */
  private void getPSNeedUpdateMatrix(Set<Integer> matrixIdInPSSet,
      List<MatrixMeta> needCreateMatrixes, List<Integer> needReleaseMatrixes,
      ParameterServerId psId) {
    readLock.lock();
    try {
      Map<Integer, MatrixMeta> matrixIdToPSMetaMap = matrixPartitionsOnPS.get(psId);

      // NOTE(review): when the master has no entry at all for this PS, nothing is released even
      // if the PS reports matrices. Confirm this is intended.
      if (matrixIdToPSMetaMap == null) {
        return;
      }

      // A matrix that exists on the parameter server but not on the master must be removed
      // from the parameter server
      for (int matrixId : matrixIdInPSSet) {
        LOG.debug("matrix in ps " + matrixId);
        if (!matrixIdToPSMetaMap.containsKey(matrixId)) {
          LOG.debug("matrix " + matrixId + " need release");
          needReleaseMatrixes.add(matrixId);
        }
      }

      // A matrix that exists on the master but not on the parameter server must be built by
      // the parameter server
      for (Entry<Integer, MatrixMeta> psMatrixEntry : matrixIdToPSMetaMap.entrySet()) {
        LOG.debug(
            "matrix in master " + psMatrixEntry.getKey() + ", " + psMatrixEntry.getValue()
                .getName());
        if (!matrixIdInPSSet.contains(psMatrixEntry.getKey())) {
          needCreateMatrixes.add(psMatrixEntry.getValue());
        }
      }
    } finally {
      readLock.unlock();
    }
  }

  /**
   * Record the matrices a parameter server has reported. Ids are only added, never removed.
   *
   * @param psId id of the reporting parameter server
   * @param matrixReports matrix reports of that parameter server
   */
  private void psMatricesUpdate(ParameterServerId psId, List<MatrixReport> matrixReports) {
    writeLock.lock();
    try {
      Set<Integer> matrixIdSet = psIdToMatrixIdsMap.get(psId);
      if (matrixIdSet == null) {
        matrixIdSet = new HashSet<>();
        psIdToMatrixIdsMap.put(psId, matrixIdSet);
      }

      for (MatrixReport matrixReport : matrixReports) {
        matrixIdSet.add(matrixReport.matrixId);
      }
    } finally {
      writeLock.unlock();
    }
  }

  /**
   * Release a matrix by id. Only the meta kept on the master is removed: the matrix meta, the
   * set of parameter servers holding the matrix, and its entry in every per-PS index.
   *
   * @param matrixId id of the matrix to release
   */
  public void releaseMatrix(int matrixId) {
    writeLock.lock();
    try {
      matrixMetaManager.removeMatrix(matrixId);
      matrixIdToPSSetMap.remove(matrixId);

      for (Map<Integer, MatrixMeta> psMatrixMap : matrixPartitionsOnPS.values()) {
        psMatrixMap.remove(matrixId);
      }
    } finally {
      writeLock.unlock();
    }
  }

  /**
   * Release a matrix by name. Only the meta kept on the master is removed. Does nothing if the
   * name resolves to a negative matrix id.
   *
   * @param matrixName name of the matrix to release
   */
  public void releaseMatrix(String matrixName) {
    writeLock.lock();
    try {
      int matrixId = matrixMetaManager.getMatrixId(matrixName);
      if (matrixId < 0) {
        return;
      }

      // The write lock is reentrant, so the name lookup and the removal done by the id-based
      // overload stay in one critical section
      releaseMatrix(matrixId);
    } finally {
      writeLock.unlock();
    }
  }

  /**
   * Write the protos of all matrix metas to the output stream, each as a length-delimited
   * message.
   *
   * @param output output stream
   * @throws IOException if writing to the stream fails
   */
  public void serialize(FSDataOutputStream output) throws IOException {
    Map<Integer, MatrixMeta> matrices = matrixMetaManager.getMatrixMetas();
    for (MatrixMeta meta : matrices.values()) {
      ProtobufUtil.convertToMatrixMetaProto(meta).writeDelimitedTo(output);
    }
  }

  /**
   * Read matrix meta protos from the input stream until it is exhausted, and register every
   * matrix read.
   *
   * @param input input stream
   * @throws IOException if reading from the stream fails
   * @throws InvalidParameterException declared for callers; see the conversion helpers
   * @throws ClassNotFoundException declared for callers; see the conversion helpers
   */
  public void deserialize(FSDataInputStream input)
      throws IOException, InvalidParameterException, ClassNotFoundException {
    while (input.available() > 0) {
      MatrixMeta meta = ProtobufUtil.convertToMatrixMeta(ProtobufUtil.loadMatrixMetaProto(input));
      matrixMetaManager.addMatrix(meta);
      writeLock.lock();
      try {
        buildPSMatrixMeta(meta);
        // Move the id generator past every restored id, otherwise the next created matrix
        // would reuse the id of a restored one
        maxMatrixId = Math.max(maxMatrixId, meta.getId() + 1);
      } finally {
        writeLock.unlock();
      }
    }
  }

  /**
   * Get the ids of the parameter servers that hold partitions of a matrix.
   *
   * @param matrixId matrix id
   * @return a snapshot of the ps id set, or null if nothing is recorded for this matrix
   */
  public Set<ParameterServerId> getPsIds(int matrixId) {
    readLock.lock();
    try {
      Set<ParameterServerId> psIds = matrixIdToPSSetMap.get(matrixId);
      // Hand out a copy: the internal set is modified under the write lock
      return (psIds == null) ? null : new HashSet<>(psIds);
    } finally {
      readLock.unlock();
    }
  }

  /**
   * Get the ids of the parameter servers that are master of at least one partition of a matrix.
   *
   * @param matrixId matrix id
   * @return master ps id set; empty if the matrix is unknown
   */
  public Set<ParameterServerId> getMasterPsIds(int matrixId) {
    Set<ParameterServerId> psSet = new HashSet<>();
    MatrixMeta matrixMeta = matrixMetaManager.getMatrixMeta(matrixId);
    if (matrixMeta == null) {
      // Unknown or already released matrix
      return psSet;
    }

    Map<Integer, PartitionMeta> partMetas = matrixMeta.getPartitionMetas();
    for (PartitionMeta partMeta : partMetas.values()) {
      psSet.add(partMeta.getMasterPs());
    }
    return psSet;
  }

  /**
   * Get the metas of all matrices, as kept by the matrix meta manager.
   *
   * @return map from matrix id to matrix meta
   */
  public Map<Integer, MatrixMeta> getMatrixMetas() {
    return matrixMetaManager.getMatrixMetas();
  }

  /**
   * Check whether a matrix, given by name, exists and has been created on the parameter servers.
   *
   * @param matrixName matrix name
   * @return false if no matrix has this name, otherwise the result of the id-based check
   */
  public boolean isCreated(String matrixName) {
    return matrixMetaManager.exists(matrixName)
        && isCreated(matrixMetaManager.getMatrixId(matrixName));
  }

  /**
   * Check whether a matrix has been created on the parameter servers: every parameter server
   * that holds partitions of the matrix must have reported the matrix.
   *
   * @param matrixId matrix id
   * @return true if the matrix exists, is placed on at least one parameter server and has been
   * reported by all of them
   */
  public boolean isCreated(int matrixId) {
    readLock.lock();
    try {
      if (!matrixMetaManager.exists(matrixId)) {
        return false;
      }

      Set<ParameterServerId> psIdSet = matrixIdToPSSetMap.get(matrixId);
      if (psIdSet == null || psIdSet.isEmpty()) {
        return false;
      }

      for (ParameterServerId psId : psIdSet) {
        Set<Integer> reportedMatrixIds = psIdToMatrixIdsMap.get(psId);
        if (reportedMatrixIds == null || !reportedMatrixIds.contains(matrixId)) {
          return false;
        }
      }
      return true;
    } finally {
      readLock.unlock();
    }
  }

  /**
   * Get the parameter servers of a partition, as kept by the matrix meta manager.
   *
   * @param matrixId matrix id
   * @param partId partition id
   * @return parameter server ids of the partition
   */
  public List<ParameterServerId> getPss(int matrixId, int partId) {
    return matrixMetaManager.getPss(matrixId, partId);
  }

  /**
   * Check whether every known matrix has been created on the parameter servers.
   *
   * @return true if the per-matrix check succeeds for all matrices, or if there are no matrices
   */
  public boolean isAllMatricesCreated() {
    readLock.lock();
    try {
      for (int id : matrixMetaManager.getMatrixMetas().keySet()) {
        // Stop at the first matrix that is not created yet
        if (!isCreated(id)) {
          return false;
        }
      }
      return true;
    } finally {
      readLock.unlock();
    }
  }

  /**
   * Notify the manager that a parameter server failed. The parameter server is removed through
   * the matrix meta manager; the per-PS indexes kept in this class are left unchanged.
   *
   * @param psId id of the failed parameter server
   */
  public void psFailed(ParameterServerId psId) {
    matrixMetaManager.removePs(psId);
  }

  /**
   * Notify the manager that a parameter server recovered: the parameter server is added back,
   * through the matrix meta manager, to every partition placed on it.
   *
   * @param psId id of the recovered parameter server
   */
  public void psRecovered(ParameterServerId psId) {
    // The per-PS index is changed under the write lock, so it is read under the read lock
    readLock.lock();
    try {
      Map<Integer, MatrixMeta> matrixIdToMetaMap = matrixPartitionsOnPS.get(psId);
      if (matrixIdToMetaMap == null) {
        return;
      }

      for (MatrixMeta meta : matrixIdToMetaMap.values()) {
        Map<Integer, PartitionMeta> partMetas = meta.getPartitionMetas();
        if (partMetas == null) {
          continue;
        }

        for (PartitionMeta partMeta : partMetas.values()) {
          matrixMetaManager.addPs(meta.getId(), partMeta.getPartId(), psId);
        }
      }
    } finally {
      readLock.unlock();
    }
  }

  /**
   * Get the partitions of a matrix whose master is the given parameter server.
   *
   * @param matrixId matrix id
   * @param psId parameter server id
   * @return ids of the partitions mastered by the parameter server; empty if the matrix is
   * unknown
   */
  public List<Integer> getMasterPartsInPS(int matrixId, ParameterServerId psId) {
    List<Integer> needSavePartInPS = new ArrayList<>();
    MatrixMeta matrixMeta = matrixMetaManager.getMatrixMeta(matrixId);
    if (matrixMeta == null) {
      // Unknown or already released matrix
      return needSavePartInPS;
    }

    for (PartitionMeta partMeta : matrixMeta.getPartitionMetas().values()) {
      if (psId.equals(partMeta.getMasterPs())) {
        needSavePartInPS.add(partMeta.getPartId());
      }
    }

    return needSavePartInPS;
  }

  /**
   * Tell whether a matrix with the given name is registered.
   *
   * @param matrixName matrix name
   * @return true if the matrix exists
   */
  public boolean exist(String matrixName) {
    final boolean registered = matrixMetaManager.exists(matrixName);
    return registered;
  }


  /**
   * Create the matrix named "sync_1". A failure is logged and not propagated.
   */
  private void addSyncMatrix() {
    final MatrixContext syncContext = new MatrixContext("sync_1", 1, 1);
    try {
      createMatrix(syncContext);
    } catch (Exception createFailure) {
      LOG.error("Create sync matrix failed", createFailure);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy