基于Levenshtein距离算法的编辑距离算法

deltamaster posted @ Sep 16, 2011 03:31:15 PM in 人工智能 with tags 人工智能 模糊搜索 模式识别 编辑距离 , 1675 阅读

 

  public static final int getDistance(String word, String similar) {

    //get the weights for each possible operation
    final int costOfDeletingSourceCharacter = config.getInteger(Configuration.COST_REMOVE_CHAR);
    final int costOfInsertingSourceCharacter = config.getInteger(Configuration.COST_INSERT_CHAR);
    final int costOfSubstitutingLetters = config.getInteger(Configuration.COST_SUBST_CHARS);
    final int costOfSwappingLetters = config.getInteger(Configuration.COST_SWAP_CHARS);
    final int costOfChangingCase = config.getInteger(Configuration.COST_CHANGE_CASE);

    int a_size = word.length() + 1;
    int b_size = similar.length() + 1;
    int[][] matrix = new int[a_size][b_size];
    matrix[0][0] = 0;

    for (int i = 1; i != a_size; ++i)
      matrix[i][0] = matrix[i - 1][0] + costOfInsertingSourceCharacter; //initialize the first column

    for (int j = 1; j != b_size; ++j)
      matrix[0][j] = matrix[0][j - 1] + costOfDeletingSourceCharacter; //initalize the first row

    word = " " + word;
    similar = " " + similar;

    for (int i = 1; i != a_size; ++i) {
      char sourceChar = word.charAt(i);
      for (int j = 1; j != b_size; ++j) {

        char otherChar = similar.charAt(j);
        if (sourceChar == otherChar) {
          matrix[i][j] = matrix[i - 1][j - 1]; //no change required, so just carry the current cost up
          continue;
        }

        int costOfSubst = costOfSubstitutingLetters + matrix[i - 1][j - 1];
        //if needed, add up the cost of doing a swap
        int costOfSwap = Integer.MAX_VALUE;
        boolean isSwap = (i != 1) && (j != 1) && sourceChar == similar.charAt(j - 1) && word.charAt(i - 1) == otherChar;
        if (isSwap)
          costOfSwap = costOfSwappingLetters + matrix[i - 2][j - 2];

        int costOfDelete = costOfDeletingSourceCharacter + matrix[i][j - 1];
        int costOfInsertion = costOfInsertingSourceCharacter + matrix[i - 1][j];

        int costOfCaseChange = Integer.MAX_VALUE;
        String strSrcChar = "" + sourceChar;
        String strOtherChar = "" + otherChar;

        if (strSrcChar.compareToIgnoreCase(strOtherChar) == 0)
          costOfCaseChange = costOfChangingCase + matrix[i - 1][j - 1];

        matrix[i][j] = minimum(costOfSubst, costOfSwap, costOfDelete, costOfInsertion, costOfCaseChange);
      }
    }
    int cost = matrix[a_size - 1][b_size - 1];

    if (false)
      System.out.println(dumpMatrix(word, similar, matrix));

    return cost;
  }
* 本文在CC BY-SA(署名-相同方式共享)协议下发布。

登录 *


loading captcha image...
(输入验证码)
or Ctrl+Enter