| package com.mzl.flower.utils; | 
| import com.hankcs.hanlp.HanLP; | 
|   | 
| import com.hankcs.hanlp.dictionary.CustomDictionary; | 
| import org.apache.commons.lang3.StringUtils; | 
|   | 
|   | 
| import java.util.ArrayList; | 
|   | 
| import java.util.Collections; | 
|   | 
| import java.util.List; | 
|   | 
| import java.util.stream.Collectors; | 
|   | 
|   | 
|   | 
| public class SimilarityUtil { | 
|   | 
|     static { | 
|   | 
|         CustomDictionary.add("子类"); | 
|   | 
|         CustomDictionary.add("父类"); | 
|   | 
|     } | 
|   | 
|   | 
|   | 
|     private SimilarityUtil() { | 
|   | 
|     } | 
|   | 
|   | 
|   | 
|     /** | 
|   | 
|      * 获得两个句子的相似度 | 
|   | 
|      * | 
|   | 
|      * @param sentence1 | 
|   | 
|      * @param sentence2 | 
|   | 
|      * @return | 
|   | 
|      */ | 
|   | 
|     public static double getSimilarity(String sentence1, String sentence2) { | 
|         if(StringUtils.isBlank(sentence1) || StringUtils.isBlank(sentence2)){ | 
|             return 0; | 
|         } | 
|   | 
|         List<String> sent1Words = getSplitWords(sentence1); | 
|   | 
|         System.out.println(sent1Words); | 
|   | 
|         List<String> sent2Words = getSplitWords(sentence2); | 
|   | 
|         System.out.println(sent2Words); | 
|   | 
|         List<String> allWords = mergeList(sent1Words, sent2Words); | 
|   | 
|   | 
|   | 
|         int[] statistic1 = statistic(allWords, sent1Words); | 
|   | 
|         int[] statistic2 = statistic(allWords, sent2Words); | 
|   | 
|   | 
|   | 
|         double dividend = 0; | 
|   | 
|         double divisor1 = 0; | 
|   | 
|         double divisor2 = 0; | 
|   | 
|         for (int i = 0; i < statistic1.length; i++) { | 
|   | 
|             dividend += statistic1[i] * statistic2[i]; | 
|   | 
|             divisor1 += Math.pow(statistic1[i], 2); | 
|   | 
|             divisor2 += Math.pow(statistic2[i], 2); | 
|   | 
|         } | 
|   | 
|         return dividend / (Math.sqrt(divisor1) * Math.sqrt(divisor2)); | 
|   | 
|     } | 
|   | 
|   | 
|   | 
|     private static int[] statistic(List<String> allWords, List<String> sentWords) { | 
|   | 
|         int[] result = new int[allWords.size()]; | 
|   | 
|         for (int i = 0; i < allWords.size(); i++) { | 
|   | 
|             result[i] = Collections.frequency(sentWords, allWords.get(i)); | 
|   | 
|         } | 
|   | 
|         return result; | 
|   | 
|     } | 
|   | 
|   | 
|   | 
|     private static List<String> mergeList(List<String> list1, List<String> list2) { | 
|   | 
|         List<String> result = new ArrayList<>(); | 
|   | 
|         result.addAll(list1); | 
|   | 
|         result.addAll(list2); | 
|   | 
|         return result.stream().distinct().collect(Collectors.toList()); | 
|   | 
|     } | 
|   | 
|   | 
|   | 
|     private static List<String> getSplitWords(String sentence) { | 
|   | 
|         // 标点符号会被单独分为一个Term,去除之 | 
|   | 
|         return HanLP.segment(sentence.toLowerCase()).stream().map(a -> a.word).filter(s -> !"`~!@#$^&*()=|{}':;',\\[\\].<>/?~!@#¥……&*()——|{}【】‘;:”“'。,、? ".contains(s)).collect(Collectors.toList()); | 
|   | 
|     } | 
|   | 
|   | 
| } |