More on Association Rules using Python
이전 포스팅에서 다뤘던 mlxtend 의 추가 기능을 활용해 보기 위한 포스팅이다.
# Load the market-basket CSV into `mdf` and inspect its shape, columns and first rows.
import pandas as pd
import numpy as np
import itertools ## needed when building combinations.
mdf = pd.read_csv('D:/★2020_ML_DL_Project/Alchemy/dataset/marketbasket.csv',encoding='UTF8',header='infer')
# NOTE(review): the counts quoted in the two comments below (1361 rows, 255 columns)
# do not match the output printed right after this cell — confirm which file was loaded.
print(mdf.shape) ## the number of transactions is 1361.
print(mdf.columns) ## there are as many as 255 items.
mdf.head()
(315, 7)
Index(['0', '1', '2', '3', '4', '5', '6'], dtype='object')
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|
| 0 | Bread | Wine | Eggs | Meat | Cheese | Pencil | Diaper |
| 1 | Bread | Cheese | Meat | Diaper | Wine | Milk | Pencil |
| 2 | Cheese | Meat | Eggs | Milk | Wine | NaN | NaN |
| 3 | Cheese | Meat | Eggs | Milk | Wine | NaN | NaN |
| 4 | Meat | Pencil | Wine | NaN | NaN | NaN | NaN |
# Strip leading/trailing whitespace from every column label and write the
# cleaned labels back onto the DataFrame, then preview the first two rows.
new_columns = mdf.columns.str.strip().to_list()
mdf.columns = new_columns
mdf.head(2)
| | Hair Conditioner | Lemons | Standard coffee | Frozen Chicken Wings | 98pct. Fat Free Hamburger | Sugar Cookies | Onions | Deli Ham | Dishwasher Detergent | Beets | ... | Lollipops | Plain White Bread | Blueberry Yogurt | Frozen Chicken Thighs | Mixed Vegetables | Souring Pads | Tuna Spread | Toilet Paper | White Wine | Columbian Coffee |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 255 columns
mlxtend 활용
from tqdm import tqdm
tqdm.pandas()
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori,association_rules,fpgrowth
TransactionEncoder
- 연관성 분석을 하기 좋은 형태의 데이터셋으로 변환해 주는 클래스
cust_id, [item01, item02, item03 ~ item100] 처럼 만들어준다.
본 데이터셋은 이미 변환이 되어 있기 때문에 필요 없다. [2차원 list 또는 array 형태를 입력받아 변환한다.]
te = TransactionEncoder() 예시
te_rslt = te.fit(mdf_lst).transform(mdf_lst) 예시
mdf.shape
(1361, 255)
# Mine frequent itemsets from the item columns of `mdf` with FP-Growth.
# min_support=0.05 is the support cut-off; use_colnames=True makes the
# `itemsets` column show item (column) names rather than column positions.
item_set = fpgrowth(mdf,min_support=0.05,use_colnames=True)
item_set
| | support | itemsets |
|---|---|---|
| 0 | 0.055841 | ( Plums) |
| 1 | 0.057311 | ( Pancake Mix) |
| 2 | 0.109478 | ( 2pct. Milk) |
| 3 | 0.119030 | ( White Bread) |
| 4 | 0.097722 | ( Potato Chips) |
| 5 | 0.093314 | ( 98pct. Fat Free Hamburger) |
| 6 | 0.074210 | ( Toilet Paper) |
| 7 | 0.080088 | ( Onions) |
| 8 | 0.071271 | ( Hamburger Buns) |
| 9 | 0.058780 | ( French Fries) |
| 10 | 0.055107 | ( Sugar Cookies) |
| 11 | 0.092579 | ( Hot Dogs) |
| 12 | 0.067597 | ( Domestic Beer) |
| 13 | 0.063924 | ( Popcorn Salt) |
| 14 | 0.058780 | ( Hair Conditioner) |
| 15 | 0.051433 | ( Waffles) |
| 16 | 0.122704 | ( Eggs) |
| 17 | 0.085231 | ( Sweet Relish) |
| 18 | 0.079353 | ( Toothpaste) |
| 19 | 0.066128 | ( Tomatoes) |
| 20 | 0.054372 | ( Canned Tuna) |
| 21 | 0.053637 | ( Apples) |
| 22 | 0.050698 | ( Sour Cream) |
| 23 | 0.077884 | ( Cola) |
| 24 | 0.069067 | ( Pepperoni Pizza - Frozen) |
| 25 | 0.062454 | ( Ramen Noodles) |
| 26 | 0.058780 | ( Hot Dog Buns) |
| 27 | 0.057311 | ( Garlic) |
| 28 | 0.077149 | ( Wheat Bread) |
| 29 | 0.064658 | ( Bologna) |
| 30 | 0.059515 | ( Bananas) |
| 31 | 0.055841 | ( Frozen Shrimp) |
| 32 | 0.054372 | ( Sandwich Bags) |
| 33 | 0.060985 | ( Raisins) |
| 34 | 0.056576 | ( Orange Juice) |
| 35 | 0.052902 | ( C Cell Batteries) |
| 36 | 0.055841 | ( Oranges) |
| 37 | 0.054372 | ( Mushrooms) |
| 38 | 0.052168 | ( Eggs, 2pct. Milk) |
| 39 | 0.051433 | ( 2pct. Milk, White Bread) |
| 40 | 0.055107 | ( Eggs, White Bread) |
| 41 | 0.051433 | ( White Bread, Potato Chips) |
def calculate_length(df, idx):
    """Add a ``length_consequent`` column with the size of each cell in column ``idx``.

    Args:
        df: DataFrame whose column at position ``idx`` holds sized collections
            (for example the frozensets produced by ``association_rules``).
        idx: Zero-based *position* of the column to measure.

    Returns:
        The same DataFrame object, modified in place, with an integer
        ``length_consequent`` column added (or overwritten).
    """
    # Select the column by position and measure every cell in one pass.
    # Assigning a plain list avoids index alignment, so this works for any
    # index (filtered, re-ordered, non-integer), not only a default RangeIndex,
    # and the resulting column is integer-typed instead of float.
    df['length_consequent'] = [len(items) for items in df.iloc[:, idx]]
    return df
# Derive association rules from the frequent itemsets, filtering on the
# "lift" metric with a minimum threshold of 3.0.
# NOTE(review): the meaning of the trailing "0.53 / 5.6" note is not evident from this file.
rule_rslt_lift = association_rules(item_set, metric="lift", min_threshold=3.0) ## 0.53 / 5.6
rule_rslt_lift
| | antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ( Eggs) | ( 2pct. Milk) | 0.122704 | 0.109478 | 0.052168 | 0.425150 | 3.883414 | 0.038734 | 1.549137 |
| 1 | ( 2pct. Milk) | ( Eggs) | 0.109478 | 0.122704 | 0.052168 | 0.476510 | 3.883414 | 0.038734 | 1.675861 |
| 2 | ( 2pct. Milk) | ( White Bread) | 0.109478 | 0.119030 | 0.051433 | 0.469799 | 3.946889 | 0.038402 | 1.661576 |
| 3 | ( White Bread) | ( 2pct. Milk) | 0.119030 | 0.109478 | 0.051433 | 0.432099 | 3.946889 | 0.038402 | 1.568093 |
| 4 | ( Eggs) | ( White Bread) | 0.122704 | 0.119030 | 0.055107 | 0.449102 | 3.773010 | 0.040501 | 1.599152 |
| 5 | ( White Bread) | ( Eggs) | 0.119030 | 0.122704 | 0.055107 | 0.462963 | 3.773010 | 0.040501 | 1.633586 |
| 6 | ( White Bread) | ( Potato Chips) | 0.119030 | 0.097722 | 0.051433 | 0.432099 | 4.421702 | 0.039801 | 1.588793 |
| 7 | ( Potato Chips) | ( White Bread) | 0.097722 | 0.119030 | 0.051433 | 0.526316 | 4.421702 | 0.039801 | 1.859825 |
rule_rslt_lift.index
RangeIndex(start=0, stop=8, step=1)
# Column position 1 of the rules table is `consequents`, so this appends the
# number of items in each rule's consequent as `length_consequent`.
# calculate_length returns the DataFrame it was given, so `rule_rslt_lift01`
# and `rule_rslt_lift` refer to the same object.
rule_rslt_lift01 = calculate_length(rule_rslt_lift,1)
rule_rslt_lift01
| | antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | length_consequent |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ( Eggs) | ( 2pct. Milk) | 0.122704 | 0.109478 | 0.052168 | 0.425150 | 3.883414 | 0.038734 | 1.549137 | 1.0 |
| 1 | ( 2pct. Milk) | ( Eggs) | 0.109478 | 0.122704 | 0.052168 | 0.476510 | 3.883414 | 0.038734 | 1.675861 | 1.0 |
| 2 | ( 2pct. Milk) | ( White Bread) | 0.109478 | 0.119030 | 0.051433 | 0.469799 | 3.946889 | 0.038402 | 1.661576 | 1.0 |
| 3 | ( White Bread) | ( 2pct. Milk) | 0.119030 | 0.109478 | 0.051433 | 0.432099 | 3.946889 | 0.038402 | 1.568093 | 1.0 |
| 4 | ( Eggs) | ( White Bread) | 0.122704 | 0.119030 | 0.055107 | 0.449102 | 3.773010 | 0.040501 | 1.599152 | 1.0 |
| 5 | ( White Bread) | ( Eggs) | 0.119030 | 0.122704 | 0.055107 | 0.462963 | 3.773010 | 0.040501 | 1.633586 | 1.0 |
| 6 | ( White Bread) | ( Potato Chips) | 0.119030 | 0.097722 | 0.051433 | 0.432099 | 4.421702 | 0.039801 | 1.588793 | 1.0 |
| 7 | ( Potato Chips) | ( White Bread) | 0.097722 | 0.119030 | 0.051433 | 0.526316 | 4.421702 | 0.039801 | 1.859825 | 1.0 |
Comments