From 0cfaadd4b7e75713942bddb4d2221a8b8ea75343 Mon Sep 17 00:00:00 2001
From: Sanjay Krishnan
Date: Wed, 29 May 2019 18:26:12 -0500
Subject: [PATCH] Added in homework 4!!
---
hw4/README.md | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
hw4/core.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
hw4/hyperloglog.py | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
hw4/tests.py | 18 ++++++++++++++++++
4 files changed, 252 insertions(+)
create mode 100644 hw4/README.md
create mode 100644 hw4/core.py
create mode 100644 hw4/hyperloglog.py
create mode 100644 hw4/tests.py
diff --git a/hw4/README.md b/hw4/README.md
new file mode 100644
index 0000000..540b071
--- /dev/null
+++ b/hw4/README.md
@@ -0,0 +1,66 @@
+# Sketching
+
+*Due 6/11/19 11:59 PM (for all students)*
+*Due 6/6/19 11:59 PM (for all graduating seniors)*
+
+In this assignment, you will revisit the IMDB dataset and explore approximate ways of counting distinct values.
+
+## Getting Started
+First, pull the most recent changes from the cmsc13600-public repository:
+```
+$ git pull
+```
+Then, copy the `hw4` folder to your submission repository. Change directories to enter your submission repository. Your code will go into `hyperloglog.py` this is the only file that you will modify. Finally, add `hyperloglog.py` using `git add`:
+```
+$ git add hyperloglog.py
+$ git commit -m'initialized homework'
+```
+
+Now, you will need to fetch the data used in this assignment. Download title.csv put it in the hw4 folder:
+
+https://www.dropbox.com/s/zl7yt8cl0lvajxg/title.csv?dl=0
+
+DO NOT ADD title.csv to the git repo! After downloading the
+dataset, there is a python module provided for you called `core.py`, which reads the dataset. This module loads the data in as
+an iterator in two functions `imdb_years()` and `imdb_title_words()`:
+```
+>> for i in imdb_years():
+... print(i)
+
+1992
+1986
+
+```
+Play around with both `imdb_years()` and `imdb_title_words()` to get a feel for how the data works. You will also find it useful to install NumPy for this project:
+```
+$ pip3 install numpy
+```
+## Helper Functions
+You will first write a series of helper functions to manipulate hash codes.
+
+### hashcode(s)
+The first function to implement is `hashcode(s)`. This function takes in a string and returns a a 32-bit unsigned integer. You will use the python function `hash(obj)` to implement this function. Hint: Python uses 64 bit integers and you will find the numpy library useful to converting the hash code into a 32 bit number.
+
+### code2bin(code)
+The next function to implement is `code2bin(code)`. This function takes in a 32-bit unsigned code from the previous example and turns it into a binary list of 32 1/0 values.
+The list must be ordered from most significant to least significant bit, i.e., the most significant bit is the first element of the list.
+
+### bin2code(lst)
+This function takes in a 32-bit binary list and returns a unsigned 32-bit integer; assuming most significant bit first as before. This should be able to retrive your original code value!
+
+### rho(lst)
+The last helper function you will write is `rho(lst)` which returns the position of the left-most bit that is one. If there is no bit with the value one, just return the length of lst.
+
+## HyperLogLog Algorithm
+The gist of HyperLogLog is that it maintains a compressed array (`self.M`) that summarizes the data. For each record in your dataset, it updates this array. `self.b` determines how big this compressed array is. At the end it tries to reconstruct the distinct count from this array. You will write the `update(s)` function for HyperLogLog. The update function takes an input string `s` and updates self.M accordingly:
+1. Calculate the hash code of s
+2. Calculate the binary representation of the hashcode (call it c)
+3. Split this binary representation into two sublists b- (the first b elements) and b+ (the remaining elements)
+4. Use your helper functions to calculate j: the integer representation of (b-), and w: the position of the left-most bit in b+.
+5. Update with `self.M[j] = max(self.M[j], w)`
+
+## Testing and Submission
+ We have provided a basic wrapper in `test.py`, the test is incomplete and is not meant to comprehensively grade your assignment. If you follow directions closely you should usually be within 50 of the true count for `imdb_years()`. After you finish the assignment you can submit your code with:
+```
+$ git push
+```
diff --git a/hw4/core.py b/hw4/core.py
new file mode 100644
index 0000000..92da07f
--- /dev/null
+++ b/hw4/core.py
@@ -0,0 +1,51 @@
+'''
+The core module sets up the data structures and
+and references for this programming assignment.
+'''
+
+import platform
+import os
+import json
+
+if platform.system() == 'Windows':
+ print("This assignment will not work on a windows computer")
+ exit()
+
+def imdb_title_words():
+ f = open('title.csv','r')
+ line = f.readline()
+
+ while line != "":
+ words = line.strip().split(',')[1].split()
+
+ for w in words:
+ yield w
+
+ line = f.readline()
+
+ f.close()
+
+def imdb_years():
+ f = open('title.csv','r')
+ line = f.readline()
+
+ cnt = 0
+
+ while line != "":
+ csvsplit = line.strip().split(',')
+ year = csvsplit[len(csvsplit) - 8]
+
+ if year.strip() != "":
+ yield year
+
+ line = f.readline()
+ cnt += 1
+
+ f.close()
+
+
+
+
+
+
+
diff --git a/hw4/hyperloglog.py b/hw4/hyperloglog.py
new file mode 100644
index 0000000..e3a92c3
--- /dev/null
+++ b/hw4/hyperloglog.py
@@ -0,0 +1,117 @@
+"""
+The HyperLogLog algorithm is an algorithm designed to count the number
+of distinct values in a set when the set is too large to fit in memory.
+Like MinHash, it uses probabilistic methods to approximate the true
+value of the distinct cardinality.
+
+Write your implementation of the HyperLogLog algorithm here. The outline
+of the algorithm can be found in the paper,
+
+ "HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm"
+ by Flajolet et al.
+ (http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf)
+
+The assignment asks you to implement a simplification of this algorithm. Figure 3 in the
+paper describes the whole algorithm
+
+We will test your implementation against the IMDB dataset (title.csv) used in HW1. View
+the tests.py file to see how the class will be used. The method skeletons provided are
+for your convenience, but only the __init__, count, and update methods will be used for
+tests.
+
+Your implementation should run in less than 5 minutes. Do not use Python counting data
+structures such as Counter, histograms, or dictionaries adapted for counting purposes.
+"""
+import numpy as np
+
+# Step 1.
+def hashcode(s):
+ '''
+ This function takes in a string and returns a a 32-bit
+ unsigned integer.
+ '''
+
+ #YOUR CODE GOES HERE!!!
+ pass
+
+
+# Step 2.
+def code2bin(code):
+ '''
+ This function takes in a 32-bit unsigned code from
+ the previous example and turns it into a binary
+ list of 32 1/0 values. The most significant bit is
+ the first element of the list.
+ '''
+
+ #YOUR CODE GOES HERE!!!
+
+
+# Step 3.
+def bin2code(lst):
+ '''
+ This function takes in a 32-bit binary list
+ and returns a unsigned 32-bit integer. Assuming
+ most significant bit first.
+ '''
+
+ #YOUR CODE GOES HERE!!!
+
+#Step 4.
+def rho(lst):
+ '''
+ Returns the position of the left-most bit
+ that is one.
+ '''
+
+ #YOUR CODE GOES HERE!!!
+
+
+
+class HyperLogLog(object):
+ '''
+ HyperLogLog is an approximate distinct count
+ data structure.
+ '''
+
+ def __init__(self, b):
+ '''
+ Takes in an integer parameter b > 4 and less than 16
+ '''
+ self.b = b
+ self.M = [0]*(2 ** b)
+
+
+ #given to you don't worry about it
+ def get_alpha(self):
+ m = 2 ** self.b
+ if m == 16:
+ return 0.673
+ elif m == 32:
+ return 0.697
+ elif m == 64:
+ return 0.709
+ else:
+ return 0.7213/(1+ 1.079/m)
+
+
+ #given to you don't worry about it
+ def count(self):
+ m = 2 ** self.b
+ a = self.get_alpha()
+
+ reg_sum = 0
+ for j in range(0,m):
+ reg_sum += 2 ** (- self.M[j])
+
+ E = 2*a*(m**2)/reg_sum
+
+ return E
+
+
+ #your code
+ def update(self, s):
+ #YOUR CODE GOES HERE!!!
+
+
+
diff --git a/hw4/tests.py b/hw4/tests.py
new file mode 100644
index 0000000..8f60dac
--- /dev/null
+++ b/hw4/tests.py
@@ -0,0 +1,18 @@
+from hyperloglog import HyperLogLog
+from core import imdb_years
+from collections import Counter
+
+def main_test():
+
+ y = HyperLogLog(b = 4)
+ z = Counter()
+
+ for i, w in enumerate(imdb_years()):
+ y.update(w.encode('utf-8'))
+ z[w] += 1
+
+ print("Implemented: {}\nTrue count: {}".format(y.count(), len(z)))
+
+if __name__ == "__main__":
+ main_test()
+
--
libgit2 0.25.0