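// NOTE(review): stray page-extraction text "超导电机" (unrelated to this file) was here; kept as a comment so the file compiles.
// Java implementation for comparing the similarity of two texts.
// NOTE(review): the page title calls this "simHash", but the code below computes cosine similarity over word counts.
// A highlighted block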
package com.util;
import com.hankcs.hanlp.HanLP;
import org.jsoup.Jsoup;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
public class SimilarityUtil {
/
**
*
* @param sentence1
* @param sentence2
* @return
*/
public static double getSimilarity(String sentence1, String sentence2){
List<String> sent1Words =getSplitWords(sentence1);
List<String> sent2Words =getSplitWords(sentence2);
27.5g bt
System.out.println(sent1Words);
电子发声挂图
System.out.println(sent2Words);
List<String> allWords =mergeList(sent1Words, sent2Words);
int[] statistic1 =statistic(allWords, sent1Words);
int[] statistic2 =statistic(allWords, sent2Words);
double dividend =0;
double divisor1 =0;
double divisor2 =0;
for(int i =0; i < statistic1.length; i++){
dividend += statistic1[i]* statistic2[i];
divisor1 += Math.pow(statistic1[i],2);
divisor2 += Math.pow(statistic2[i],2);
}
return dividend /(Math.sqrt(divisor1)* Math.sqrt(divisor2));
}
private static int[]statistic(List<String> allWords, List<String> sentWords){
int[] result =new int[allWords.size()];
for(int i =0; i < allWords.size(); i++){
result[i]= Collections.frequency(sentWords, (i));
}
return result;
}
private static List<String>mergeList(List<String> list1, List<String> list2){
淤泥固化剂
List<String> result =new ArrayList<>();
result.addAll(list1);
result.addAll(list2);
return result.stream().distinct().List());
}
private static List<String>getSplitWords(String sentence){
// 去除掉html标签
sentence = Jsoup.place(" ","")).body().text();
// 标点符号会被单独分为⼀个Term,去除之
return HanLP.segment(sentence).stream().map(a -> a.word).filter(s ->!"`~!@#$^&*()=|{}':;',\\[\\].<>/?~!@#¥……&*()——|{}【】‘;:”“'。,、? ". contains(s)).List());
}
}
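// NOTE(review): stray page-extraction text "delta并联机器人" (unrelated to this file) was here; kept as a comment so the file compiles.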