Maps, Sets, & Graphs

Ned Jackson Lovely

@nedjl / njl@njl.us

slides @ www.njl.us

Big-O Notation

This is all a lie!*

*Sloppy definitions, not enough math

Speed and Size Are

Part of the API

Maps

Dictionary

Associative Array

Symbol Table

Chuck Coker

The World's Biggest Hammer

Common Usage: Lookup

    >>> foo = {'red':'0xFF0000', 'blue':'0x0000FF',
               'green': '0x00FF00'}

    >>> print(foo['green'])
    "0x00FF00"

Common Usage: Arbitrary Data

{'code': 0, 'message': 'Success', 'version': '4.2'}

{'username': 'n', 'fullname':'Ned Jackson Lovely',
 'favorite LED color':'orange', 'number of pets': 2,
 'fun words': ['rutabaga', 'runcible', 'brouhaha', 'widdershins']}

Common Usage: Symbol Table

def add(a, b):
    return a+b
def subtract(a, b):
    return a-b
def multiply(a, b):
    return a*b
def divide(a, b):
    return a/b

OPERATIONS = {'+':add, '-':subtract,
                '*':multiply, '/':divide}

def calculator(a, op, b):
    return OPERATIONS[op](int(a), int(b))

>>> print calculator('57', '+', '123')
180

My Favorite Dict Function

    >>> this_thing = {'a':1, 'b':2}
    >>> this_thing.update({'c':5, 'a':17})
    >>> this_thing
    {"a":17, "b":2, "c":5}

A Toy Map Implementation

class MyHash(object):
    def __init__(self):
        self._entries = [None]*37

def calculate_index(key, array):
    h = hash(key)
    return h % len(array)

hash("this thing") == 482052534689916146
482052534689916146 % 37 == 7
def insert_into_array(key, value, array):
    index = calculate_index(key, array)
    if array[index] and array[index][0] != key:
        return False
    array[index] = (key, value)
    return True

>>> insert_into_array("this thing", "the value it maps to", my_array)
>>> print my_array[7]
("this thing", "the value it maps to")

    def add(self, key, value):
        if insert_into_array(key, value, self._entries):
            return

        #Resize
        length = len(self._entries)
        while True:
            length = length*2+1
            if self.resize(length):
                break
        self.add(key,value)

    def resize(self, length):
        new_array = [None]*length

        for entry in self._entries:
            if not entry:
                continue
            if not insert_into_array(entry[0], entry[1], new_array):
                return False

        self._entries = new_array
        return True

    def get(self, key, default=None):
        index = calculate_index(key, self._entries)
        if self._entries[index] and self._entries[index][0] == key:
            return self._entries[index][1]
        return default

    def member(self, key):
        index = calculate_index(key, self._entries)
        val = self._entries[index]
        return val and val[0] == key

    def keys(self):
        return [x[0] for x in self._entries if x]

    def values(self):
        return [x[1] for x in self._entries if x]

    def items(self):
        return [x for x in self._entries if x]

InsertionO(1) (except when it isn't)
LookupO(1)
DeletionO(1)
MembershipO(1)

Different Implementations, Different Tradeoffs

>>> foo = {}
>>> foo[{'a':1}] = 7
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  TypeError: unhashable type: 'dict'

Sets

Bags

Caffeinatrix

    things_which_are_awesome = set(['python', 'dogs', 'fire', 'cats'])
    'python' in things_which_are_awesome

    ones = set([1,1,1,1,1])
    len(ones) == 1

    some_primes = set([2,3,5,7,11])
    7 in some_primes
    my_things = set(['cat', 'dog', 'shoes', 'wombat'])
    your_things = set(['dog', 'shoes', 'rutabaga'])
    our_things = my_things.intersection(your_things)
    deduplicated = list(set(my_list_with_duplicates))

A Dictionary With Just Keys

Easiest Implementation
class MySet(object):
    def __init__(self, *args):
        self._values = MyHash()
        for k in args:
            self.add(k)

    def add(self, k):
        self._values.add(k, None)

    def values(self):
        return self._values.keys()

    def member(self, k):
        return self._values.member(k)

    def issubset(self, other):
        for k in self.values():
            if not other.member(k):
                return False
        return True

    a = MySet(1,2,3)
    b = MySet(1,2,3,4,5)

    assert not b.issubset(a)
    assert a.issubset(b)

    def intersection(self, other):
        rv = MySet()
        for k in self.values():
            if other.member(k):
                rv.add(k)
        return rv

    a = MySet(1,2,3)
    b = MySet(2,3,4,5)

    assert a.intersection(b) == MySet(2,3)

    def union(self, other):
        rv = MySet()
        for k in self.values():
            rv.add(k)
        for k in other.values():
            rv.add(k)
        return rv

    a = MySet(1,2,3,4)
    b = MySet(3,4,5,6)

    assert a.union(b) == MySet(1,2,3,4,5,6)

    def difference(self, other):
        rv = MySet()
        for k in self.values():
            if other.member(k):
                continue
            rv.add(k)
        return rv

    a = MySet(1,2,3,4)
    b = MySet(2,4,5)

    assert a.difference(b) == MySet(1,3)

Most of these are O(n)-ish
  • issubset
  • issuperset
  • union
  • intersection
  • difference
  • symmetric_difference

Graphs

MATH

There can be lots of it

Node (Vertex)

Edge (Arc)

Weight

Path

Cycle

Directed

Undirected

Clique

AWESOMENESS

There is a lot of it

Boil The Frog
Friendships
Website links
Roads
Course Prerequisites
Spelling Corrections
Chess Moves
Finite Automatons
Function Calls
Sparse Matrix

There are Two Common Implementation Methods

Let's use hash tables! We know hash tables!
class Graph(object):
    def __init__(self):
        self._nodes = {}
        self._id_counter = 0

    def add_node(self):
        id = self._id_counter
        self._nodes[id] = []
        self._id_counter += 1
        return id

    def add_edge(self, frm, to):
        self._nodes[frm].append(to)

    def get_nodes(self):
        return self._nodes.keys()

    def get_edges(self, id):
        return tuple(self._nodes[id])

Page Rank

(Dijkstra's Algorithm was too long)

import random
import collections

def page_rank(g, n, damping_factor=0.85):
    rv = collections.Counter()
    for _ in range(n):
        current = random.choice(g.get_nodes())
        while True:
            if random.random() > damping_factor:
                rv[current] += 1
                break
            current = random.choice(g.get_edges(current))
    return rv

import csv

def import_csv(f):
    inp = csv.DictReader(open(f))
    g = Graph()
    names = {}

    def get_or_create(t):
        if t not in names:
            names[t] = g.add_node(t)
        return names[t]

    for d in inp:
        l = get_or_create(d['Loser/tie'])
        w = get_or_create(d['Winner/tie'])
        g.add_edge(l,w)

    return g, {v:k for k,v in names.items()}

Results

Seattle Seahawks58,925
Carolina Panthers54,127
Indianapolis Colts54,049
Arizona Cardinals52,248
New Orleans Saints51,659
San Francisco 49ers51,467
Cincinnati Bengals49,183
New England Patriots48,620
Miami Dolphins47,232
San Diego Chargers40,057

Networkx

Thank You!

Ned Jackson Lovely

@nedjl / njl@njl.us

slides @ www.njl.us