001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text.similarity; 018 019import java.util.HashSet; 020import java.util.Set; 021 022/** 023 * Measures the Jaccard similarity (aka Jaccard index) of two sets of character 024 * sequence. Jaccard similarity is the size of the intersection divided by the 025 * size of the union of the two sets. 026 * 027 * <p> 028 * For further explanation about Jaccard Similarity, refer 029 * https://en.wikipedia.org/wiki/Jaccard_index 030 * </p> 031 * 032 * @since 1.0 033 */ 034public class JaccardSimilarity implements SimilarityScore<Double> { 035 036 /** 037 * Calculates Jaccard Similarity of two set character sequence passed as 038 * input. 039 * 040 * @param left first character sequence 041 * @param right second character sequence 042 * @return index 043 * @throws IllegalArgumentException 044 * if either String input {@code null} 045 */ 046 @Override 047 public Double apply(final CharSequence left, final CharSequence right) { 048 if (left == null || right == null) { 049 throw new IllegalArgumentException("Input cannot be null"); 050 } 051 return calculateJaccardSimilarity(left, right); 052 } 053 054 /** 055 * Calculates Jaccard Similarity of two character sequences passed as 056 * input. Does the calculation by identifying the union (characters in at 057 * least one of the two sets) of the two sets and intersection (characters 058 * which are present in set one which are present in set two) 059 * 060 * @param left first character sequence 061 * @param right second character sequence 062 * @return index 063 */ 064 private Double calculateJaccardSimilarity(final CharSequence left, final CharSequence right) { 065 final int leftLength = left.length(); 066 final int rightLength = right.length(); 067 if (leftLength == 0 && rightLength == 0) { 068 return 1d; 069 } 070 if (leftLength == 0 || rightLength == 0) { 071 return 0d; 072 } 073 final Set<Character> leftSet = new HashSet<>(); 074 for (int i = 0; i < leftLength; i++) { 075 leftSet.add(left.charAt(i)); 076 } 077 final Set<Character> rightSet = new HashSet<>(); 078 for (int i = 0; i < rightLength; i++) { 079 rightSet.add(right.charAt(i)); 080 } 081 final Set<Character> unionSet = new HashSet<>(leftSet); 082 unionSet.addAll(rightSet); 083 final int intersectionSize = leftSet.size() + rightSet.size() - unionSet.size(); 084 return 1.0d * intersectionSize / unionSet.size(); 085 } 086}