CountingBloomFilter.java
上传用户:quxuerui
上传日期:2018-01-08
资源大小:41811k
文件大小:10k
- /**
- *
- * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
- * All rights reserved.
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- * - Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * - Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the distribution.
- * - Neither the name of the University Catholique de Louvain - UCL
- * nor the names of its contributors may be used to endorse or
- * promote products derived from this software without specific prior
- * written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
- /**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.hadoop.util.bloom;
- import java.io.DataInput;
- import java.io.DataOutput;
- import java.io.IOException;
- /**
- * Implements a <i>counting Bloom filter</i>, as defined by Fan et al. in a ToN
- * 2000 paper.
- * <p>
- * A counting Bloom filter is an improvement to standard a Bloom filter as it
- * allows dynamic additions and deletions of set membership information. This
- * is achieved through the use of a counting vector instead of a bit vector.
- * <p>
- * Originally created by
- * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
- *
- * @see Filter The general behavior of a filter
- *
- * @see <a href="http://portal.acm.org/citation.cfm?id=343571.343572">Summary cache: a scalable wide-area web cache sharing protocol</a>
- */
- public final class CountingBloomFilter extends Filter {
- /** Storage for the counting buckets */
- private long[] buckets;
- /** We are using 4bit buckets, so each bucket can count to 15 */
- private final static long BUCKET_MAX_VALUE = 15;
- /** Default constructor - use with readFields */
- public CountingBloomFilter() {}
-
- /**
- * Constructor
- * @param vectorSize The vector size of <i>this</i> filter.
- * @param nbHash The number of hash function to consider.
- * @param hashType type of the hashing function (see
- * {@link org.apache.hadoop.util.hash.Hash}).
- */
- public CountingBloomFilter(int vectorSize, int nbHash, int hashType) {
- super(vectorSize, nbHash, hashType);
- buckets = new long[buckets2words(vectorSize)];
- }
- /** returns the number of 64 bit words it would take to hold vectorSize buckets */
- private static int buckets2words(int vectorSize) {
- return ((vectorSize - 1) >>> 4) + 1;
- }
- @Override
- public void add(Key key) {
- if(key == null) {
- throw new NullPointerException("key can not be null");
- }
- int[] h = hash.hash(key);
- hash.clear();
- for(int i = 0; i < nbHash; i++) {
- // find the bucket
- int wordNum = h[i] >> 4; // div 16
- int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4
-
- long bucketMask = 15L << bucketShift;
- long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
-
- // only increment if the count in the bucket is less than BUCKET_MAX_VALUE
- if(bucketValue < BUCKET_MAX_VALUE) {
- // increment by 1
- buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue + 1) << bucketShift);
- }
- }
- }
- /**
- * Removes a specified key from <i>this</i> counting Bloom filter.
- * <p>
- * <b>Invariant</b>: nothing happens if the specified key does not belong to <i>this</i> counter Bloom filter.
- * @param key The key to remove.
- */
- public void delete(Key key) {
- if(key == null) {
- throw new NullPointerException("Key may not be null");
- }
- if(!membershipTest(key)) {
- throw new IllegalArgumentException("Key is not a member");
- }
- int[] h = hash.hash(key);
- hash.clear();
- for(int i = 0; i < nbHash; i++) {
- // find the bucket
- int wordNum = h[i] >> 4; // div 16
- int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4
-
- long bucketMask = 15L << bucketShift;
- long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
-
- // only decrement if the count in the bucket is between 0 and BUCKET_MAX_VALUE
- if(bucketValue >= 1 && bucketValue < BUCKET_MAX_VALUE) {
- // decrement by 1
- buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue - 1) << bucketShift);
- }
- }
- }
- @Override
- public void and(Filter filter) {
- if(filter == null
- || !(filter instanceof CountingBloomFilter)
- || filter.vectorSize != this.vectorSize
- || filter.nbHash != this.nbHash) {
- throw new IllegalArgumentException("filters cannot be and-ed");
- }
- CountingBloomFilter cbf = (CountingBloomFilter)filter;
-
- int sizeInWords = buckets2words(vectorSize);
- for(int i = 0; i < sizeInWords; i++) {
- this.buckets[i] &= cbf.buckets[i];
- }
- }
- @Override
- public boolean membershipTest(Key key) {
- if(key == null) {
- throw new NullPointerException("Key may not be null");
- }
- int[] h = hash.hash(key);
- hash.clear();
- for(int i = 0; i < nbHash; i++) {
- // find the bucket
- int wordNum = h[i] >> 4; // div 16
- int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4
- long bucketMask = 15L << bucketShift;
- if((buckets[wordNum] & bucketMask) == 0) {
- return false;
- }
- }
- return true;
- }
- /**
- * This method calculates an approximate count of the key, i.e. how many
- * times the key was added to the filter. This allows the filter to be
- * used as an approximate <code>key -> count</code> map.
- * <p>NOTE: due to the bucket size of this filter, inserting the same
- * key more than 15 times will cause an overflow at all filter positions
- * associated with this key, and it will significantly increase the error
- * rate for this and other keys. For this reason the filter can only be
- * used to store small count values <code>0 <= N << 15</code>.
- * @param key key to be tested
- * @return 0 if the key is not present. Otherwise, a positive value v will
- * be returned such that <code>v == count</code> with probability equal to the
- * error rate of this filter, and <code>v > count</code> otherwise.
- * Additionally, if the filter experienced an underflow as a result of
- * {@link #delete(Key)} operation, the return value may be lower than the
- * <code>count</code> with the probability of the false negative rate of such
- * filter.
- */
- public int approximateCount(Key key) {
- int res = Integer.MAX_VALUE;
- int[] h = hash.hash(key);
- hash.clear();
- for (int i = 0; i < nbHash; i++) {
- // find the bucket
- int wordNum = h[i] >> 4; // div 16
- int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4
-
- long bucketMask = 15L << bucketShift;
- long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
- if (bucketValue < res) res = (int)bucketValue;
- }
- if (res != Integer.MAX_VALUE) {
- return res;
- } else {
- return 0;
- }
- }
- @Override
- public void not() {
- throw new UnsupportedOperationException("not() is undefined for "
- + this.getClass().getName());
- }
- @Override
- public void or(Filter filter) {
- if(filter == null
- || !(filter instanceof CountingBloomFilter)
- || filter.vectorSize != this.vectorSize
- || filter.nbHash != this.nbHash) {
- throw new IllegalArgumentException("filters cannot be or-ed");
- }
- CountingBloomFilter cbf = (CountingBloomFilter)filter;
- int sizeInWords = buckets2words(vectorSize);
- for(int i = 0; i < sizeInWords; i++) {
- this.buckets[i] |= cbf.buckets[i];
- }
- }
- @Override
- public void xor(Filter filter) {
- throw new UnsupportedOperationException("xor() is undefined for "
- + this.getClass().getName());
- }
- @Override
- public String toString() {
- StringBuilder res = new StringBuilder();
- for(int i = 0; i < vectorSize; i++) {
- if(i > 0) {
- res.append(" ");
- }
-
- int wordNum = i >> 4; // div 16
- int bucketShift = (i & 0x0f) << 2; // (mod 16) * 4
-
- long bucketMask = 15L << bucketShift;
- long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
-
- res.append(bucketValue);
- }
- return res.toString();
- }
- // Writable
- @Override
- public void write(DataOutput out) throws IOException {
- super.write(out);
- int sizeInWords = buckets2words(vectorSize);
- for(int i = 0; i < sizeInWords; i++) {
- out.writeLong(buckets[i]);
- }
- }
- @Override
- public void readFields(DataInput in) throws IOException {
- super.readFields(in);
- int sizeInWords = buckets2words(vectorSize);
- buckets = new long[sizeInWords];
- for(int i = 0; i < sizeInWords; i++) {
- buckets[i] = in.readLong();
- }
- }
- }