2016-06-03 17 views
-2

Ich habe in der CSV-Datei für jede Spalte einen Titel gespeichert und möchte Assoziationsregeln zwischen diesen Titeln erhalten. Der folgende Code für den Apriori-Algorithmus liefert Itemsets und ihre Assoziationsregeln. Wie kann ich ihn so ändern, dass ich den Titel (den Namen der Kategorie, zu der ein Item gehört) erhalte, anstatt der Items selbst? Wie man den Titel des Tupels anstelle des Tupelwerts erhält

""" 
 
Description  : Simple Python implementation of the Apriori Algorithm 
 

 
Usage: 
 
    $python apriori.py -f DATASET.csv -s minSupport -c minConfidence 
 

 
    $python apriori.py -f DATASET.csv -s 0.15 -c 0.6 
 
""" 
 

 
import sys 
 

 
from itertools import chain, combinations 
 
from collections import defaultdict 
 
from optparse import OptionParser 
 

 

 
def subsets(arr):
    """Return an iterator over all non-empty combinations of ``arr``.

    Combinations are produced grouped by size, from size 1 up to the
    number of elements in ``arr``; each combination is a tuple.
    """
    groups_by_size = []
    for size, _ in enumerate(arr, 1):
        groups_by_size.append(combinations(arr, size))
    return chain.from_iterable(groups_by_size)
 

 

 
def returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet):
    """Return the candidates from ``itemSet`` that reach ``minSupport``.

    For every candidate the number of transactions containing it is
    counted. That count is added to ``freqSet[candidate]`` (``freqSet``
    is mutated in place; candidates that occur in no transaction are not
    touched). A candidate is kept when its count divided by the number
    of transactions is at least ``minSupport``.
    """
    frequent = set()

    for candidate in itemSet:
        hits = sum(1 for transaction in transactionList
                   if candidate.issubset(transaction))
        if not hits:
            # Never seen: leave freqSet untouched and skip the candidate.
            continue

        freqSet[candidate] += hits

        if float(hits) / len(transactionList) >= minSupport:
            frequent.add(candidate)

    return frequent
 

 

 
def joinSet(itemSet, length):
    """Join ``itemSet`` with itself and keep the unions of size ``length``.

    Every ordered pair of members (including a member paired with itself)
    is unioned; the distinct unions having exactly ``length`` elements
    are returned as a set.
    """
    joined = set()
    for left in itemSet:
        for right in itemSet:
            merged = left.union(right)
            if len(merged) == length:
                joined.add(merged)
    return joined
 

 

 
def getItemSetTransactionList(data_iterator):
    """Build the 1-itemsets and the transaction list from the records.

    Each record from ``data_iterator`` becomes one frozenset in the
    returned transaction list (order preserved). The returned item set
    holds a single-element frozenset for every distinct item seen.

    Returns the pair ``(itemSet, transactionList)``.
    """
    transactionList = [frozenset(record) for record in data_iterator]

    # Generate 1-itemSets
    itemSet = set(frozenset([item])
                  for transaction in transactionList
                  for item in transaction)

    return itemSet, transactionList
 

 

 
def runApriori(data_iter, minSupport, minConfidence):
    """
    Run the apriori algorithm. data_iter is a record iterator.

    Frequent itemsets are grown level by level: the frequent k-itemsets
    are joined into (k+1)-candidates, which are then filtered by
    minSupport, until a level comes back empty.

    Return both:

    - items: list of (tuple, support)

    - rules: list of ((pretuple, posttuple), confidence), one entry for
      every split of a frequent itemset of size >= 2 whose confidence
      (support of the whole itemset / support of the pre part) is at
      least minConfidence
    """
    itemSet, transactionList = getItemSetTransactionList(data_iter)

    # Occurrence count of every candidate itemset counted so far.
    freqSet = defaultdict(int)

    # Dictionary which stores (key=n, value=set of n-itemSets)
    # which satisfy minSupport
    largeSet = dict()

    currentLSet = returnItemsWithMinSupport(itemSet,
                                            transactionList,
                                            minSupport,
                                            freqSet)
    k = 2
    while currentLSet:
        largeSet[k - 1] = currentLSet
        candidates = joinSet(currentLSet, k)
        currentLSet = returnItemsWithMinSupport(candidates,
                                                transactionList,
                                                minSupport,
                                                freqSet)
        k = k + 1

    def getSupport(item):
        """local function which Returns the support of an item"""
        return float(freqSet[item]) / len(transactionList)

    # Walk the levels in ascending itemset size so the result order does
    # not depend on dict iteration order.
    sizes = sorted(largeSet)

    toRetItems = []
    for size in sizes:
        toRetItems.extend([(tuple(item), getSupport(item))
                           for item in largeSet[size]])

    toRetRules = []
    for size in sizes:
        # Rules need a non-empty left and right side, so 1-itemsets are
        # skipped by key rather than by slicing the dict's items (dict
        # views cannot be sliced on Python 3 and slicing depended on
        # iteration order).
        if size < 2:
            continue
        for item in largeSet[size]:
            for element in map(frozenset, subsets(item)):
                remain = item.difference(element)
                if len(remain) > 0:
                    confidence = getSupport(item) / getSupport(element)
                    if confidence >= minConfidence:
                        toRetRules.append(((tuple(element), tuple(remain)),
                                           confidence))
    return toRetItems, toRetRules
 

 

 
def printResults(items, rules):
    """Print the itemsets sorted by support and the rules sorted by confidence.

    items is an iterable of (item, support) pairs and rules an iterable
    of ((pre, post), confidence) pairs. Both are printed in ascending
    order of their numeric value.

    The key functions index the pair instead of unpacking it in the
    lambda signature, and print is called with a single pre-formatted
    string, so the function is valid on both Python 2 and Python 3.
    """
    for item, support in sorted(items, key=lambda pair: pair[1]):
        print("item: %s , %.3f" % (str(item), support))

    print("\n------------------------ RULES:")

    for rule, confidence in sorted(rules, key=lambda pair: pair[1]):
        pre, post = rule
        print("Rule: %s ==> %s , %.3f" % (str(pre), str(post), confidence))
 

 

 
def dataFromFile(fname):
    """Read the file ``fname`` and yield one record per line.

    Each line is stripped of surrounding whitespace and trailing commas,
    split on ',' and yielded as a frozenset of the resulting strings.
    A blank line therefore yields a record holding the single item ''.

    The file is opened inside a ``with`` block so it is closed once the
    generator is exhausted or closed.
    """
    # The 'U' flag was removed from open() in Python 3.11; Python 3 text
    # mode already translates newlines, so 'U' is only requested on
    # Python 2.
    mode = 'r' if sys.version_info[0] >= 3 else 'rU'

    with open(fname, mode) as file_iter:
        for line in file_iter:
            line = line.strip().rstrip(',')       # Remove trailing comma
            yield frozenset(line.split(','))
 

 

 
if __name__ == "__main__":
    # Command-line entry point: parse the options, load the records,
    # run the algorithm and print the itemsets and rules.

    optparser = OptionParser()
    optparser.add_option('-f', '--inputFile',
                         dest='input',
                         help='filename containing csv',
                         default=None)
    optparser.add_option('-s', '--minSupport',
                         dest='minS',
                         help='minimum support value',
                         default=0.15,
                         type='float')
    optparser.add_option('-c', '--minConfidence',
                         dest='minC',
                         help='minimum confidence value',
                         default=0.6,
                         type='float')

    (options, args) = optparser.parse_args()

    if options.input is None:
        # No file given: read the dataset from standard input. The lines
        # are split into comma-separated items exactly like
        # dataFromFile does; passing sys.stdin through unparsed would
        # make every line a set of single characters.
        inFile = (frozenset(line.strip().rstrip(',').split(','))
                  for line in sys.stdin)
    else:
        inFile = dataFromFile(options.input)

    minSupport = options.minS
    minConfidence = options.minC

    items, rules = runApriori(inFile, minSupport, minConfidence)

    printResults(items, rules)

+0

Was ist der "Titel des Tupels"? Ein Tupel ist das: '(1,2)' - wie lautet der Titel? –

+0

Angenommen, ich habe eine Tabelle im CSV-Format, deren erste Zeile die Attributnamen enthält. Ich möchte diese Attributnamen extrahieren. Entschuldigen Sie mein schlechtes Englisch. –

Antwort

0

Wenn die erste Zeile der CSV-Datei die Namen aller Spalten enthält, können Sie Folgendes tun:

# Read the column names of a CSV file.
import csv 
with open('C:/mypath/to/csvfile.csv', 'r') as f: 
    # No fieldnames are passed to DictReader here; per the csv module
    # documentation it then uses the values of the file's first row.
    d_reader = csv.DictReader(f) 
    # Column names as reported by the reader.
    headers = d_reader.fieldnames
Verwandte Themen