问题描述
- 用simhash比较几句话的相似性
- 现在能做到的是把每个字的hash值求出并存放在string[]中,接下来就不太会了,求大神指导
解决方案
//words [0]为属性 [1]为权重
//hashbits 生成的SimHash位数(例如64)
//return SimHash串
getSimHash(String[][] words, int hashbits)
------------------------------------------------------------------
//计算汉明距离
//str1 simHash生成的code
//str2
//return 整型距离,越小越相似(两个串长度不同时返回-1)
getDistance(str1, str2)
----------------------------------------------------------------------
下面是代码
package com.yeahmobi.ymconv.util;public class MySimHash { public static String getSimHash(String[][] words int hashbits) { double[] hash = new double[hashbits]; for (int i = 0; i < words.length; i++) { long t = MurmurHash.hash64(words[i][0]); // long t = hash(words[i][0] 64).longValue(); String str = getZero(Long.toBinaryString(t) hashbits); for (int j = 0; j < str.length(); j++) { int weights = Integer.parseInt(words[i][1]) <= 0 ? 1 : Integer.parseInt(words[i][1]); int c = Integer.parseInt(str.charAt(j) + """"); if (c == 1) hash[j] = hash[j] + (weights); else hash[j] = hash[j] + (-weights); } } String hash1 = """"; for (double d : hash) { hash1 += d > 0 ? ""1"" : ""0""; } return hash1; } public static String getZero(String str int hashbits) { return String.format(""%"" + hashbits + ""s"" str).replace("" ""0""); } public static int getDistance(String str1 String str2) { int distance; if (str1.length() != str2.length()) { distance = -1; } else { distance = 0; for (int i = 0; i < str1.length(); i++) { if (str1.charAt(i) != str2.charAt(i)) { distance++; } } } return distance; } public static void main(String[] args) {// String s1 = MySimHash.getSimHash(new String[][] { { ""187.237.239.16""3"" } { ""mx""3"" } { ""775""3"" } { ""60541""3"" } { ""2342256""3"" } { ""alcatel""3"" } { ""onetouch5020""3"" } { ""android""3"" } { ""4.1.1""3"" } { ""hh""3"" } } 64);// String s2 = MySimHash.getSimHash(new String[][] { { ""177.224.174.214""1"" } { ""mx""1"" } { ""775""1"" } { ""6177""1"" } { ""2478822""1"" } { ""generic""1"" } { ""storm""1"" } { ""android""1"" } { ""4.2.2""1"" } } 64);// String s3 = MySimHash.getSimHash(new String[][] { { ""5.246.82.36""1"" } { ""sdf""1"" } { ""663""1"" } { ""333""1"" } { ""55""0"" } { ""sd""1"" } { ""er""1"" } { ""34""1"" } { ""sdfasdf""1"" } { ""hh""1"" } } 64);// String s4 = MySimHash.getSimHash(new String[][] { { ""189.132.168.157""1"" } { ""mx""1"" } { ""390""1"" } { ""3203""1"" } { ""2342277""1"" } { ""samsung""1"" } { ""gt-i8190l""1"" } { 
""android""1"" } { ""4.1.2""1"" } } 64);// String s5 = MySimHash.getSimHash(new String[][] { { ""187.237.239.16""1"" } { ""mx""1"" } { ""775""3"" } { ""60541""1"" } { ""2342256""1"" } { ""alcatel""1"" } { ""onetouch5020""1"" } { ""android""1"" } { ""4.1.1""1"" } { ""hh""1"" } } 64);// String s6 = MySimHash.getSimHash(new String[][] { { ""187.237.239.25""3"" } { ""mx""3"" } { ""775""3"" } { ""60541""3"" } { ""2342256""3"" } { ""alcatel""3"" } { ""onetouch5020""3"" } { ""android""3"" } { ""4.1.1""3"" } { ""hh""3"" } } 64);// String s7 = MySimHash.getSimHash(new String[][] { { ""187.237.239.16""1"" } { ""mx""3"" } { ""775""3"" } { ""60541""3"" } { ""2342256""3"" } { ""alcatel""3"" } { ""onetouch5020""3"" } { ""android""3"" } { ""4.1.1""3"" } { ""hh""3"" } } 64);// System.out.println(""----------"");// System.out.println(MySimHash.getDistance(s1 s2));// System.out.println(MySimHash.getDistance(s1 s3));// System.out.println(MySimHash.getDistance(s1 s4));// System.out.println(MySimHash.getDistance(s1 s5));// System.out.println(MySimHash.getDistance(s1 s6));// System.out.println(MySimHash.getDistance(s1 s7));//// System.out.println(s1);// System.out.println(s2);// System.out.println(s3);// System.out.println(s4);// System.out.println(s5);// System.out.println(s6); }}
时间: 2025-01-30 00:23:03