001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.similarity;
018
019import java.util.HashSet;
020import java.util.Set;
021
022/**
023 * Measures the Jaccard similarity (aka Jaccard index) of two sets of character
024 * sequence. Jaccard similarity is the size of the intersection divided by the
025 * size of the union of the two sets.
026 *
027 * <p>
028 * For further explanation about Jaccard Similarity, refer
029 * https://en.wikipedia.org/wiki/Jaccard_index
030 * </p>
031 *
032 * @since 1.0
033 */
034public class JaccardSimilarity implements SimilarityScore<Double> {
035
036    /**
037     * Calculates Jaccard Similarity of two set character sequence passed as
038     * input.
039     *
040     * @param left first character sequence
041     * @param right second character sequence
042     * @return index
043     * @throws IllegalArgumentException
044     *             if either String input {@code null}
045     */
046    @Override
047    public Double apply(final CharSequence left, final CharSequence right) {
048        if (left == null || right == null) {
049            throw new IllegalArgumentException("Input cannot be null");
050        }
051        return calculateJaccardSimilarity(left, right);
052    }
053
054    /**
055     * Calculates Jaccard Similarity of two character sequences passed as
056     * input. Does the calculation by identifying the union (characters in at
057     * least one of the two sets) of the two sets and intersection (characters
058     * which are present in set one which are present in set two)
059     *
060     * @param left first character sequence
061     * @param right second character sequence
062     * @return index
063     */
064    private Double calculateJaccardSimilarity(final CharSequence left, final CharSequence right) {
065        final int leftLength = left.length();
066        final int rightLength = right.length();
067        if (leftLength == 0 && rightLength == 0) {
068            return 1d;
069        }
070        if (leftLength == 0 || rightLength == 0) {
071            return 0d;
072        }
073        final Set<Character> leftSet = new HashSet<>();
074        for (int i = 0; i < leftLength; i++) {
075            leftSet.add(left.charAt(i));
076        }
077        final Set<Character> rightSet = new HashSet<>();
078        for (int i = 0; i < rightLength; i++) {
079            rightSet.add(right.charAt(i));
080        }
081        final Set<Character> unionSet = new HashSet<>(leftSet);
082        unionSet.addAll(rightSet);
083        final int intersectionSize = leftSet.size() + rightSet.size() - unionSet.size();
084        return 1.0d * intersectionSize / unionSet.size();
085    }
086}