More on Association Rules Using Python

3 minute read

기존에 다뤘던 mlxtend 의 추가 기능을 활용해 보기 위한 포스팅이다.

import pandas as pd
import numpy as np
import itertools ## 조합만들때, 필요하다.
# Load the market-basket CSV from a local Windows path into a DataFrame,
# then print its shape and column labels and preview the first rows.
# NOTE(review): the output shown directly below this cell is (315, 7), which
# does not match the 1361 / 255 figures in the comments; the later
# `mdf.shape` output is (1361, 255) -- confirm which file was actually loaded.
mdf = pd.read_csv('D:/★2020_ML_DL_Project/Alchemy/dataset/marketbasket.csv',encoding='UTF8',header='infer')
print(mdf.shape) ## the post states there are 1361 transactions (rows)
print(mdf.columns) ## the post states there are as many as 255 items (columns)
mdf.head()
(315, 7)
Index(['0', '1', '2', '3', '4', '5', '6'], dtype='object')
0 1 2 3 4 5 6
0 Bread Wine Eggs Meat Cheese Pencil Diaper
1 Bread Cheese Meat Diaper Wine Milk Pencil
2 Cheese Meat Eggs Milk Wine NaN NaN
3 Cheese Meat Eggs Milk Wine NaN NaN
4 Meat Pencil Wine NaN NaN NaN NaN
# Strip leading/trailing whitespace from every column label and write the
# cleaned labels back onto the DataFrame, then preview the first two rows.
new_columns = mdf.columns.str.strip().to_list()
mdf.columns = new_columns
mdf.head(2)
Hair Conditioner Lemons Standard coffee Frozen Chicken Wings 98pct. Fat Free Hamburger Sugar Cookies Onions Deli Ham Dishwasher Detergent Beets ... Lollipops Plain White Bread Blueberry Yogurt Frozen Chicken Thighs Mixed Vegetables Souring Pads Tuna Spread Toilet Paper White Wine Columbian Coffee
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 255 columns

mlxtend 활용

from tqdm import tqdm
tqdm.pandas()
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori,association_rules,fpgrowth

TransactionEncoder

  • 연관성분석 하기 좋은 데이터셋으로, 변환시켜주는 라이브러리

    cust_id,[item01,item02,item03 ~ item100] 처럼 만들어준다.

본 데이터셋은 이미 변환되어 있기 때문에 필요 없다. [2차원 list 또는 array 형태를 입력으로 받아서 변환시킨다.]

te = TransactionEncoder() 예시
te_rslt = te.fit(mdf_lst).transform(mdf_lst) 예시

mdf.shape
(1361, 255)
item_set = fpgrowth(mdf,min_support=0.05,use_colnames=True)
item_set
support itemsets
0 0.055841 ( Plums)
1 0.057311 ( Pancake Mix)
2 0.109478 ( 2pct. Milk)
3 0.119030 ( White Bread)
4 0.097722 ( Potato Chips)
5 0.093314 ( 98pct. Fat Free Hamburger)
6 0.074210 ( Toilet Paper)
7 0.080088 ( Onions)
8 0.071271 ( Hamburger Buns)
9 0.058780 ( French Fries)
10 0.055107 ( Sugar Cookies)
11 0.092579 ( Hot Dogs)
12 0.067597 ( Domestic Beer)
13 0.063924 ( Popcorn Salt)
14 0.058780 ( Hair Conditioner)
15 0.051433 ( Waffles)
16 0.122704 ( Eggs)
17 0.085231 ( Sweet Relish)
18 0.079353 ( Toothpaste)
19 0.066128 ( Tomatoes)
20 0.054372 ( Canned Tuna)
21 0.053637 ( Apples)
22 0.050698 ( Sour Cream)
23 0.077884 ( Cola)
24 0.069067 ( Pepperoni Pizza - Frozen)
25 0.062454 ( Ramen Noodles)
26 0.058780 ( Hot Dog Buns)
27 0.057311 ( Garlic)
28 0.077149 ( Wheat Bread)
29 0.064658 ( Bologna)
30 0.059515 ( Bananas)
31 0.055841 ( Frozen Shrimp)
32 0.054372 ( Sandwich Bags)
33 0.060985 ( Raisins)
34 0.056576 ( Orange Juice)
35 0.052902 ( C Cell Batteries)
36 0.055841 ( Oranges)
37 0.054372 ( Mushrooms)
38 0.052168 ( Eggs, 2pct. Milk)
39 0.051433 ( 2pct. Milk, White Bread)
40 0.055107 ( Eggs, White Bread)
41 0.051433 ( White Bread, Potato Chips)
def calculate_length(df,idx):
    """Add a ``length_consequent`` column holding the size of each cell in one column.

    Args:
        df: DataFrame whose column at position ``idx`` holds sized collections
            (for example the frozensets produced by ``association_rules``).
        idx: Zero-based *positional* index of the column to measure.

    Returns:
        The same ``df`` object, modified in place, with an integer
        ``length_consequent`` column containing ``len(cell)`` for every row.

    Raises:
        TypeError: If a cell in the selected column has no length (e.g. NaN).
        IndexError: If ``idx`` is outside the range of columns.
    """
    # Select the column purely by position and assign the result positionally
    # (via a NumPy array) so the function works for any row index -- filtered,
    # re-ordered, or non-integer -- instead of assuming a 0..n-1 RangeIndex.
    df['length_consequent'] = df.iloc[:, idx].map(len).astype('int64').to_numpy()
    return df
# Derive association rules from the frequent itemsets in `item_set`, using
# lift as the selection metric with a minimum threshold of 3.0, then display them.
# NOTE(review): the trailing "0.53 / 5.6" is unexplained in the post --
# presumably values observed or tried during exploration; confirm.
rule_rslt_lift = association_rules(item_set, metric="lift", min_threshold=3.0) ## 0.53 / 5.6
rule_rslt_lift
antecedents consequents antecedent support consequent support support confidence lift leverage conviction
0 ( Eggs) ( 2pct. Milk) 0.122704 0.109478 0.052168 0.425150 3.883414 0.038734 1.549137
1 ( 2pct. Milk) ( Eggs) 0.109478 0.122704 0.052168 0.476510 3.883414 0.038734 1.675861
2 ( 2pct. Milk) ( White Bread) 0.109478 0.119030 0.051433 0.469799 3.946889 0.038402 1.661576
3 ( White Bread) ( 2pct. Milk) 0.119030 0.109478 0.051433 0.432099 3.946889 0.038402 1.568093
4 ( Eggs) ( White Bread) 0.122704 0.119030 0.055107 0.449102 3.773010 0.040501 1.599152
5 ( White Bread) ( Eggs) 0.119030 0.122704 0.055107 0.462963 3.773010 0.040501 1.633586
6 ( White Bread) ( Potato Chips) 0.119030 0.097722 0.051433 0.432099 4.421702 0.039801 1.588793
7 ( Potato Chips) ( White Bread) 0.097722 0.119030 0.051433 0.526316 4.421702 0.039801 1.859825
rule_rslt_lift.index
RangeIndex(start=0, stop=8, step=1)
rule_rslt_lift01 = calculate_length(rule_rslt_lift,1)
rule_rslt_lift01
antecedents consequents antecedent support consequent support support confidence lift leverage conviction length_consequent
0 ( Eggs) ( 2pct. Milk) 0.122704 0.109478 0.052168 0.425150 3.883414 0.038734 1.549137 1.0
1 ( 2pct. Milk) ( Eggs) 0.109478 0.122704 0.052168 0.476510 3.883414 0.038734 1.675861 1.0
2 ( 2pct. Milk) ( White Bread) 0.109478 0.119030 0.051433 0.469799 3.946889 0.038402 1.661576 1.0
3 ( White Bread) ( 2pct. Milk) 0.119030 0.109478 0.051433 0.432099 3.946889 0.038402 1.568093 1.0
4 ( Eggs) ( White Bread) 0.122704 0.119030 0.055107 0.449102 3.773010 0.040501 1.599152 1.0
5 ( White Bread) ( Eggs) 0.119030 0.122704 0.055107 0.462963 3.773010 0.040501 1.633586 1.0
6 ( White Bread) ( Potato Chips) 0.119030 0.097722 0.051433 0.432099 4.421702 0.039801 1.588793 1.0
7 ( Potato Chips) ( White Bread) 0.097722 0.119030 0.051433 0.526316 4.421702 0.039801 1.859825 1.0

Comments