More on Association Rules Using Python
기존에 다뤘던 mlxtend 의 추가 기능을 활용해 보기 위한 포스팅이다.
import pandas as pd
import numpy as np
import itertools ## 조합만들때, 필요하다.
# Load the market-basket CSV; header='infer' takes the column names from the first row.
mdf = pd.read_csv('D:/★2020_ML_DL_Project/Alchemy/dataset/marketbasket.csv',encoding='UTF8',header='infer')
# NOTE(review): the output shown right after this cell is (315, 7), which does not
# match the 1361 x 255 figures in the comments below -- confirm which file was loaded.
print(mdf.shape) ## The author states there are 1361 transactions (rows).
print(mdf.columns) ## The author states there are as many as 255 items (columns).
mdf.head()
(315, 7)
Index(['0', '1', '2', '3', '4', '5', '6'], dtype='object')
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | Bread | Wine | Eggs | Meat | Cheese | Pencil | Diaper |
1 | Bread | Cheese | Meat | Diaper | Wine | Milk | Pencil |
2 | Cheese | Meat | Eggs | Milk | Wine | NaN | NaN |
3 | Cheese | Meat | Eggs | Milk | Wine | NaN | NaN |
4 | Meat | Pencil | Wine | NaN | NaN | NaN | NaN |
# Strip leading/trailing whitespace from every column label and write the
# cleaned labels back onto the DataFrame.
new_columns = mdf.columns.str.strip().to_list()
mdf.columns = new_columns
mdf.head(2)
Hair Conditioner | Lemons | Standard coffee | Frozen Chicken Wings | 98pct. Fat Free Hamburger | Sugar Cookies | Onions | Deli Ham | Dishwasher Detergent | Beets | ... | Lollipops | Plain White Bread | Blueberry Yogurt | Frozen Chicken Thighs | Mixed Vegetables | Souring Pads | Tuna Spread | Toilet Paper | White Wine | Columbian Coffee | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 255 columns
mlxtend 활용
from tqdm import tqdm
tqdm.pandas()
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori,association_rules,fpgrowth
TransactionEncoder
- 연관성분석 하기 좋은 데이터셋으로, 변환시켜주는 라이브러리
cust_id,[item01,item02,item03 ~ item100] 처럼 만들어준다.
본 데이터셋은 이미 변환된 형태이기 때문에 필요 없다. [2차원 list 또는 array 형태의 데이터를 입력받아 변환한다.]
te = TransactionEncoder() 예시
te_rslt = te.fit(mdf_lst).transform(mdf_lst) 예시
# Check the (rows, columns) size of the table before mining itemsets.
mdf.shape
(1361, 255)
# Mine frequent itemsets with FP-Growth, keeping itemsets whose support is at
# least 0.05; use_colnames=True reports items by column name instead of by
# column position.
item_set = fpgrowth(mdf,min_support=0.05,use_colnames=True)
item_set
support | itemsets | |
---|---|---|
0 | 0.055841 | ( Plums) |
1 | 0.057311 | ( Pancake Mix) |
2 | 0.109478 | ( 2pct. Milk) |
3 | 0.119030 | ( White Bread) |
4 | 0.097722 | ( Potato Chips) |
5 | 0.093314 | ( 98pct. Fat Free Hamburger) |
6 | 0.074210 | ( Toilet Paper) |
7 | 0.080088 | ( Onions) |
8 | 0.071271 | ( Hamburger Buns) |
9 | 0.058780 | ( French Fries) |
10 | 0.055107 | ( Sugar Cookies) |
11 | 0.092579 | ( Hot Dogs) |
12 | 0.067597 | ( Domestic Beer) |
13 | 0.063924 | ( Popcorn Salt) |
14 | 0.058780 | ( Hair Conditioner) |
15 | 0.051433 | ( Waffles) |
16 | 0.122704 | ( Eggs) |
17 | 0.085231 | ( Sweet Relish) |
18 | 0.079353 | ( Toothpaste) |
19 | 0.066128 | ( Tomatoes) |
20 | 0.054372 | ( Canned Tuna) |
21 | 0.053637 | ( Apples) |
22 | 0.050698 | ( Sour Cream) |
23 | 0.077884 | ( Cola) |
24 | 0.069067 | ( Pepperoni Pizza - Frozen) |
25 | 0.062454 | ( Ramen Noodles) |
26 | 0.058780 | ( Hot Dog Buns) |
27 | 0.057311 | ( Garlic) |
28 | 0.077149 | ( Wheat Bread) |
29 | 0.064658 | ( Bologna) |
30 | 0.059515 | ( Bananas) |
31 | 0.055841 | ( Frozen Shrimp) |
32 | 0.054372 | ( Sandwich Bags) |
33 | 0.060985 | ( Raisins) |
34 | 0.056576 | ( Orange Juice) |
35 | 0.052902 | ( C Cell Batteries) |
36 | 0.055841 | ( Oranges) |
37 | 0.054372 | ( Mushrooms) |
38 | 0.052168 | ( Eggs, 2pct. Milk) |
39 | 0.051433 | ( 2pct. Milk, White Bread) |
40 | 0.055107 | ( Eggs, White Bread) |
41 | 0.051433 | ( White Bread, Potato Chips) |
def calculate_length(df, idx):
    """Add a ``length_consequent`` column holding the size of each itemset.

    Args:
        df: DataFrame whose column at position ``idx`` holds sized
            collections (e.g. the frozensets produced by
            ``association_rules``). It is modified in place.
        idx: Zero-based *position* of the column to measure.

    Returns:
        The same ``df`` object, with an integer ``length_consequent`` column.
        The column name is fixed for backward compatibility, whichever
        column ``idx`` selects.
    """
    # Read the column positionally in one go. The previous version fed row
    # *labels* from df.index into df.iloc, which only works when the index
    # happens to be the default 0..n-1 range.
    # Assigning the whole column at once also keeps an integer dtype;
    # filling a new column cell by cell upcast the values to float.
    df['length_consequent'] = [len(items) for items in df.iloc[:, idx]]
    return df
# Derive association rules from the frequent itemsets, keeping only rules
# whose lift is at least 3.0.
# NOTE(review): the trailing "0.53 / 5.6" is the author's own note; its meaning
# is not evident from this file.
rule_rslt_lift = association_rules(item_set, metric="lift", min_threshold=3.0) ## 0.53 / 5.6
rule_rslt_lift
antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | |
---|---|---|---|---|---|---|---|---|---|
0 | ( Eggs) | ( 2pct. Milk) | 0.122704 | 0.109478 | 0.052168 | 0.425150 | 3.883414 | 0.038734 | 1.549137 |
1 | ( 2pct. Milk) | ( Eggs) | 0.109478 | 0.122704 | 0.052168 | 0.476510 | 3.883414 | 0.038734 | 1.675861 |
2 | ( 2pct. Milk) | ( White Bread) | 0.109478 | 0.119030 | 0.051433 | 0.469799 | 3.946889 | 0.038402 | 1.661576 |
3 | ( White Bread) | ( 2pct. Milk) | 0.119030 | 0.109478 | 0.051433 | 0.432099 | 3.946889 | 0.038402 | 1.568093 |
4 | ( Eggs) | ( White Bread) | 0.122704 | 0.119030 | 0.055107 | 0.449102 | 3.773010 | 0.040501 | 1.599152 |
5 | ( White Bread) | ( Eggs) | 0.119030 | 0.122704 | 0.055107 | 0.462963 | 3.773010 | 0.040501 | 1.633586 |
6 | ( White Bread) | ( Potato Chips) | 0.119030 | 0.097722 | 0.051433 | 0.432099 | 4.421702 | 0.039801 | 1.588793 |
7 | ( Potato Chips) | ( White Bread) | 0.097722 | 0.119030 | 0.051433 | 0.526316 | 4.421702 | 0.039801 | 1.859825 |
# Inspect the row index of the rules table.
rule_rslt_lift.index
RangeIndex(start=0, stop=8, step=1)
# Add a 'length_consequent' column computed from the column at position 1
# (the 'consequents' column in the rules table shown above).
# calculate_length writes into the frame it is given and returns it, so
# rule_rslt_lift01 and rule_rslt_lift refer to the same DataFrame.
rule_rslt_lift01 = calculate_length(rule_rslt_lift,1)
rule_rslt_lift01
antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | length_consequent | |
---|---|---|---|---|---|---|---|---|---|---|
0 | ( Eggs) | ( 2pct. Milk) | 0.122704 | 0.109478 | 0.052168 | 0.425150 | 3.883414 | 0.038734 | 1.549137 | 1.0 |
1 | ( 2pct. Milk) | ( Eggs) | 0.109478 | 0.122704 | 0.052168 | 0.476510 | 3.883414 | 0.038734 | 1.675861 | 1.0 |
2 | ( 2pct. Milk) | ( White Bread) | 0.109478 | 0.119030 | 0.051433 | 0.469799 | 3.946889 | 0.038402 | 1.661576 | 1.0 |
3 | ( White Bread) | ( 2pct. Milk) | 0.119030 | 0.109478 | 0.051433 | 0.432099 | 3.946889 | 0.038402 | 1.568093 | 1.0 |
4 | ( Eggs) | ( White Bread) | 0.122704 | 0.119030 | 0.055107 | 0.449102 | 3.773010 | 0.040501 | 1.599152 | 1.0 |
5 | ( White Bread) | ( Eggs) | 0.119030 | 0.122704 | 0.055107 | 0.462963 | 3.773010 | 0.040501 | 1.633586 | 1.0 |
6 | ( White Bread) | ( Potato Chips) | 0.119030 | 0.097722 | 0.051433 | 0.432099 | 4.421702 | 0.039801 | 1.588793 | 1.0 |
7 | ( Potato Chips) | ( White Bread) | 0.097722 | 0.119030 | 0.051433 | 0.526316 | 4.421702 | 0.039801 | 1.859825 | 1.0 |
Comments