# If you are running locally, make sure you are in the directory of KuaiRec.
# `rootpath` is the prefix prepended to the relative data paths when loading,
# e.g. rootpath + "data/big_matrix.csv".
rootpath="./"


# from google.colab import drive
# drive.mount('/content/drive')

# rootpath="./drive/MyDrive/Datasets/KuaiRec/" # Make sure this path corresponds to KuaiRec in your Drive.


import ast

import pandas as pd

# Load every KuaiRec table into a pandas DataFrame. Every path is built from
# `rootpath`, so switching `rootpath` (e.g. to a mounted Drive folder) is
# enough to relocate all of the reads.
print("Loading big matrix...")
big_matrix = pd.read_csv(rootpath + "data/big_matrix.csv")
print("Loading small matrix...")
small_matrix = pd.read_csv(rootpath + "data/small_matrix.csv")

print("Loading social network...")
social_network = pd.read_csv(rootpath + "data/social_network.csv")
# The list-valued columns are read from the CSV as text and must be parsed
# back into Python lists. ast.literal_eval only accepts literals, so a
# malformed or malicious cell cannot execute code the way eval() would.
social_network["friend_list"] = social_network["friend_list"].map(ast.literal_eval)

print("Loading item features...")
item_categories = pd.read_csv(rootpath + "data/item_categories.csv")
item_categories["feat"] = item_categories["feat"].map(ast.literal_eval)

print("Loading user features...")
user_features = pd.read_csv(rootpath + "data/user_features.csv")

print("Loading items' daily features...")
item_daily_features = pd.read_csv(rootpath + "data/item_daily_features.csv")

print("All data loaded.")

Loading big matrix...
Loading small matrix...
Loading social network...
Loading item features...
Loading user features...
Loading items' daily features...
All data loaded.


# Notebook-style bare expressions: each one names a loaded table so that the
# notebook renders it as the cell output. Run as a plain script they have no
# visible effect.
big_matrix


small_matrix


item_categories


social_network


item_daily_features


user_features


import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Font properties pointing at a SimHei.ttf file next to the notebook.
# NOTE(review): `myfont` is not referenced by any of the visible plotting
# code; presumably intended for rendering CJK labels. Confirm before removing.
myfont = FontProperties(fname="./SimHei.ttf")

def visual_continue(df, func=None):
    """Plot the distribution of a continuous variable and show the figure.

    Args:
        df: Data handed to ``sns.distplot``. It must expose a ``name``
            attribute, which is used in the title and as the x-axis label.
        func: Optional callable invoked with the axes returned by
            ``sns.distplot``, before the title and labels are applied.
    """
    # NOTE(review): recent seaborn releases mark ``distplot`` as deprecated;
    # confirm against the installed seaborn version before upgrading.
    dist_ax = sns.distplot(df)
    if func:
        func(dist_ax)

    current_ax = plt.gca()
    current_ax.set_title("Statistics of {}".format(df.name), fontsize=14)
    current_ax.set_ylabel("Density", fontsize=14)
    current_ax.set_xlabel(df.name, fontsize=14)

    plt.show()

def visual_statistics_discrete(df, var="my_variable", display_ratio=True, func=None, order=None, size=(6, 4.5)):
    """Draw a count plot of a discrete variable, optionally annotated with ratios.

    Args:
        df: Values to count; passed as ``x`` to ``sns.countplot``.
        var: Variable name used in the title and as the x-axis label.
        display_ratio: When true, annotate every bar with its share of
            ``len(df)`` in percent and add a twin y-axis scaled in percent.
        func: Optional callable invoked with the count-plot axes after the
            title and axis labels have been set.
        order: Forwarded to ``sns.countplot`` as ``order``.
        size: Figure size, forwarded to ``plt.figure`` as ``figsize``.
    """
    total = len(df)

    figure = plt.figure(figsize=size)
    count_ax = figure.add_axes([0.14, 0.15, 0.74, 0.75])
    sns.countplot(x=df, color="#9fc5e8", linewidth=.6, edgecolor='k', ax=count_ax, order=order)

    plt.grid(axis='y', linestyle='-.')

    current_ax = plt.gca()
    current_ax.set_title(f"Statistics of {var}", fontsize=14)
    current_ax.set_ylabel("Count", fontsize=14)
    current_ax.set_xlabel(var, fontsize=14)

    if func:
        func(count_ax)

    if display_ratio:
        # Secondary y-axis that expresses the same bar heights as percentages.
        ratio_ax = count_ax.twinx()
        ratio_ax.set_ylabel("ratio (%)", fontsize=14)

        for bar in count_ax.patches:
            # Bounding-box corners: column 0 holds the x extents, [1, 1] is
            # the y coordinate of the bar's top edge.
            corners = bar.get_bbox().get_points()
            bar_top = corners[1, 1]
            share_label = f"{100. * bar_top / total:.1f}%"
            count_ax.annotate(share_label, (corners[:, 0].mean(), bar_top),
                              ha='center', va='bottom', fontsize=10, rotation=30)

        # Align the percent axis with the count axis (count limit -> percent).
        ratio_ax.set_ylim(0, count_ax.get_ylim()[1] / total * 100)

    plt.show()


# Ignore every warning raised from this point on (filter action 'ignore').
import warnings; warnings.simplefilter('ignore')


# Length of each user's friend list, summarised and plotted as a count plot.
friend_counts = social_network.friend_list.map(len)
print(friend_counts.describe())
visual_statistics_discrete(friend_counts, "number of friends")

count    472.000000
mean       1.419492
std        0.853295
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        5.000000
Name: friend_list, dtype: float64


# Number of tags attached to each item, summarised and plotted.
num_feat = item_categories["feat"].map(len)
tag_count_summary = num_feat.describe()
print(tag_count_summary)
visual_statistics_discrete(num_feat, var="number of tags")

count    10728.000000
mean         1.183166
std          0.436205
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          4.000000
Name: feat, dtype: float64


import collections
import itertools

# Tally how often each tag occurs across all items.
cnt = item_categories.feat.map(collections.Counter)
cnt_all = collections.Counter()
for item_tag_counts in cnt:
    cnt_all.update(item_tag_counts)

# Expand the totals back into one entry per tag occurrence, sorted by tag id,
# so the count plot can tally them along the x-axis.
all_feat = pd.Series(sorted(cnt_all.elements()), name="feat")
visual_statistics_discrete(all_feat, "tag", size=(12,4.5))


# Restrict to rows with watch_ratio <= 5 before summarising and plotting.
big_watch_ratio = big_matrix.loc[big_matrix["watch_ratio"] <= 5, "watch_ratio"]
print(big_watch_ratio.describe())
visual_continue(big_watch_ratio)

count    1.241552e+07
mean     8.510664e-01
std      7.185296e-01
min      0.000000e+00
25%      3.111008e-01
50%      7.161066e-01
75%      1.161843e+00
max      5.000000e+00
Name: watch_ratio, dtype: float64


# Restrict to rows with watch_ratio <= 5 before summarising and plotting.
small_watch_ratio = small_matrix.loc[small_matrix["watch_ratio"] <= 5, "watch_ratio"]
print(small_watch_ratio.describe())
visual_continue(small_watch_ratio)

count    4.653780e+06
mean     8.553870e-01
std      5.952060e-01
min      0.000000e+00
25%      4.657859e-01
50%      7.662410e-01
75%      1.114060e+00
max      5.000000e+00
Name: watch_ratio, dtype: float64


# Summarise all video durations, but plot only those below 100000 so the
# long tail does not dominate the figure.
big_video_duration = big_matrix["video_duration"]
print(big_video_duration.describe())
big_duration_below_cap = big_video_duration < 100000
visual_continue(big_video_duration[big_duration_below_cap])

count    1.253081e+07
mean     1.462157e+04
std      1.983474e+04
min      1.400000e+02
25%      7.434000e+03
50%      9.636000e+03
75%      1.217900e+04
max      3.150720e+05
Name: video_duration, dtype: float64


# Summarise all video durations, but plot only those below 100000 so the
# long tail does not dominate the figure.
small_video_duration = small_matrix["video_duration"]
print(small_video_duration.describe())
small_duration_below_cap = small_video_duration < 100000
visual_continue(small_video_duration[small_duration_below_cap])

count    4.676570e+06
mean     1.448645e+04
std      2.046711e+04
min      3.067000e+03
25%      7.523000e+03
50%      9.600000e+03
75%      1.193400e+04
max      3.150720e+05
Name: video_duration, dtype: float64


# Total number of interactions per user in the big matrix.
# groupby(...).size() counts every row of a group, including rows whose other
# columns are NaN, and returns a Series. The plot label is therefore carried
# by the Series' own `name` rather than an ad-hoc attribute set on a
# DataFrame, which pandas does not propagate.
big_play_time = big_matrix.groupby('user_id').size().rename("play times")
print(big_play_time.describe())
visual_continue(big_play_time)

               date
count   7176.000000
mean    1746.210424
std      991.832222
min      100.000000
25%      883.000000
50%     1846.500000
75%     2461.000000
max    16015.000000


# Total number of interactions per user in the small matrix.
# groupby(...).size() counts every row of a group, including rows whose other
# columns are NaN, and returns a Series. The plot label is therefore carried
# by the Series' own `name` rather than an ad-hoc attribute set on a
# DataFrame, which pandas does not propagate.
small_play_time = small_matrix.groupby('user_id').size().rename("play times")
print(small_play_time.describe())
visual_continue(small_play_time)

              date
count  1411.000000
mean   3314.365698
std       6.984852
min    3295.000000
25%    3309.000000
50%    3315.000000
75%    3320.000000
max    3327.000000


# Number of interactions per (user, day) pair in the big matrix.
big_daily_play_time = (
    big_matrix.groupby(['user_id', 'date']).size().rename("play times")
)
print(big_daily_play_time.describe())
visual_continue(big_daily_play_time)

count    188322.000000
mean         66.539257
std          78.752240
min           1.000000
25%          19.000000
50%          42.000000
75%          83.000000
max        3268.000000
Name: play times, dtype: float64


# Number of interactions per (user, day) pair in the small matrix.
small_daily_play_time = (
    small_matrix.groupby(['user_id', 'date']).size().rename("play times")
)
print(small_daily_play_time.describe())
visual_continue(small_daily_play_time)

count    86671.000000
mean        51.857922
std         32.608372
min          1.000000
25%         28.000000
50%         47.000000
75%         70.000000
max        402.000000
Name: play times, dtype: float64

	user_id	video_id	play_duration	video_duration	time	date	timestamp	watch_ratio
0	0	3649	13838	10867	2020-07-05 00:08:23.438	20200705	1.593879e+09	1.273397
1	0	9598	13665	10984	2020-07-05 00:13:41.297	20200705	1.593879e+09	1.244082
2	0	5262	851	7908	2020-07-05 00:16:06.687	20200705	1.593879e+09	0.107613
3	0	1963	862	9590	2020-07-05 00:20:26.792	20200705	1.593880e+09	0.089885
4	0	8234	858	11000	2020-07-05 00:43:05.128	20200705	1.593881e+09	0.078000
...	...	...	...	...	...	...	...	...
12530801	7175	1281	34618	140017	2020-09-05 15:07:10.576	20200905	1.599290e+09	0.247241
12530802	7175	3407	12619	21888	2020-09-05 15:08:45.228	20200905	1.599290e+09	0.576526
12530803	7175	10360	2407	7067	2020-09-05 19:10:29.041	20200905	1.599304e+09	0.340597
12530804	7175	10360	6455	7067	2020-09-05 19:10:36.995	20200905	1.599304e+09	0.913400
12530805	7175	10389	12263	14304	2020-09-05 21:13:51.419	20200905	1.599312e+09	0.857313

	user_id	video_id	play_duration	video_duration	time	date	timestamp	watch_ratio
0	14	148	4381	6067	2020-07-05 05:27:48.378	20200705.0	1.593898e+09	0.722103
1	14	183	11635	6100	2020-07-05 05:28:00.057	20200705.0	1.593898e+09	1.907377
2	14	3649	22422	10867	2020-07-05 05:29:09.479	20200705.0	1.593898e+09	2.063311
3	14	5262	4479	7908	2020-07-05 05:30:43.285	20200705.0	1.593898e+09	0.566388
4	14	8234	4602	11000	2020-07-05 05:35:43.459	20200705.0	1.593899e+09	0.418364
...	...	...	...	...	...	...	...	...
4676565	7162	2267	11908	5467	NaN	NaN	NaN	2.178160
4676566	7162	2065	11919	6067	NaN	NaN	NaN	1.964562
4676567	7162	1296	16690	19870	NaN	NaN	NaN	0.839960
4676568	7162	4822	11862	24400	NaN	NaN	NaN	0.486148
4676569	7162	4364	2182	19367	NaN	NaN	NaN	0.112666

	video_id	feat
0	0	[8]
1	1	[27, 9]
2	2	[9]
3	3	[26]
4	4	[5]
...	...	...
10723	10723	[11]
10724	10724	[2]
10725	10725	[15]
10726	10726	[19]
10727	10727	[5]

	user_id	friend_list
0	3371	[2975]
1	24	[2665]
2	4402	[38]
3	4295	[4694]
4	7087	[7117]
...	...	...
467	2331	[4345]
468	6163	[1332]
469	3732	[670]
470	3335	[202]
471	5352	[4202, 7126]

	video_id	date	author_id	video_type	upload_dt	upload_type	visible_status	video_duration	video_width	video_height	...	download_cnt	download_user_num	report_cnt	report_user_num	reduce_similar_cnt	reduce_similar_user_num	collect_cnt	collect_user_num	cancel_collect_cnt	cancel_collect_user_num
0	0	20200705	3309	NORMAL	2020-03-30	ShortImport	public	5966.0	720	1280	...	8	8	0	0	3	3	NaN	NaN	NaN	NaN
1	0	20200706	3309	NORMAL	2020-03-30	ShortImport	public	5966.0	720	1280	...	2	2	0	0	5	5	NaN	NaN	NaN	NaN
2	0	20200707	3309	NORMAL	2020-03-30	ShortImport	public	5966.0	720	1280	...	2	2	0	0	0	0	NaN	NaN	NaN	NaN
3	0	20200708	3309	NORMAL	2020-03-30	ShortImport	public	5966.0	720	1280	...	3	3	0	0	3	3	NaN	NaN	NaN	NaN
4	0	20200709	3309	NORMAL	2020-03-30	ShortImport	public	5966.0	720	1280	...	2	2	2	1	1	1	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
343336	10723	20200905	236	NORMAL	2020-09-05	ShortImport	public	4833.0	720	1280	...	0	0	0	0	0	0	0.0	0.0	0.0	0.0
343337	10724	20200905	5271	NORMAL	2020-09-05	LongImport	public	54720.0	720	1280	...	1	1	0	0	0	0	0.0	0.0	0.0	0.0
343338	10725	20200905	1924	NORMAL	2020-09-05	ShortImport	public	15800.0	576	1024	...	5	5	0	0	4	4	0.0	0.0	0.0	0.0
343339	10726	20200905	7604	NORMAL	2020-09-05	ShortImport	public	5132.0	528	960	...	2	2	0	0	1	1	0.0	0.0	0.0	0.0
343340	10727	20200905	7464	NORMAL	2020-09-05	ShortCamera	public	5666.0	720	1556	...	0	0	0	0	0	0	0.0	0.0	0.0	0.0

Statistics of KuaiRec¶

Load data¶

Visualization of the six tables¶

Code for visualization¶

Statistics of video features¶

Distribution of the 31 tags of items¶

Distribution of watch_ratio in big matrix¶

Distribution of watch_ratio in small matrix¶

Distribution of video duration in the big matrix (in milliseconds)¶

Distribution of video duration in the small matrix (in milliseconds)¶

Distribution of each user's total play times in the big matrix¶

Distribution of each user's total play times in the small matrix¶

Distribution of each user's daily play times in the big matrix¶

Distribution of each user's daily play times in the small matrix¶

	user_id	user_active_degree	is_lowactive_period	is_live_streamer	is_video_author	follow_user_num	follow_user_num_range	fans_user_num	fans_user_num_range	friend_user_num	...	onehot_feat8	onehot_feat9	onehot_feat10	onehot_feat11	onehot_feat12	onehot_feat13	onehot_feat14	onehot_feat15	onehot_feat16	onehot_feat17
0	0	high_active	0	0	0	5	(0,10]	0	0	0	...	184	6	3	0	0.0	0.0	0.0	0.0	0.0	0.0
1	1	full_active	0	0	0	386	(250,500]	4	[1,10)	2	...	186	6	2	0	0.0	0.0	0.0	0.0	0.0	0.0
2	2	full_active	0	0	0	27	(10,50]	0	0	0	...	51	2	3	0	0.0	0.0	0.0	0.0	0.0	0.0
3	3	full_active	0	0	0	16	(10,50]	0	0	0	...	251	3	2	0	0.0	0.0	0.0	0.0	0.0	0.0
4	4	full_active	0	0	0	122	(100,150]	4	[1,10)	0	...	99	4	2	0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
7171	7171	full_active	0	0	1	52	(50,100]	1	[1,10)	0	...	259	1	4	0	1.0	0.0	0.0	0.0	0.0	0.0
7172	7172	full_active	0	0	0	45	(10,50]	2	[1,10)	2	...	11	2	0	0	1.0	0.0	0.0	0.0	0.0	0.0
7173	7173	full_active	0	0	0	615	500+	3	[1,10)	2	...	51	2	2	0	1.0	0.0	0.0	0.0	0.0	0.0
7174	7174	full_active	0	0	0	959	500+	0	0	0	...	107	3	2	0	0.0	0.0	0.0	0.0	0.0	0.0
7175	7175	full_active	0	0	1	98	(100,150]	35	[10,100)	33	...	132	5	2	0	0.0	0.0	0.0	0.0	0.0	0.0

Statistics of KuaiRec¶

Load data¶

Visualization of the six tables¶

Code for visualization¶

Statistics of social network¶

Statistics of video features¶

Distribution of the 31 tags of items¶

Distribution of watch_ratio in big matrix¶

Distribution of watch_ratio in small matrix¶

Distribution of video duration in the big matrix (in milliseconds)¶

Distribution of video duration in the small matrix (in milliseconds)¶

Distribution of each user's total play times in the big matrix¶

Distribution of each user's total play times in the small matrix¶

Distribution of each user's daily play times in the big matrix¶

Distribution of each user's daily play times in the small matrix¶