This notebook provides some basic statistics of the KuaiRec dataset. A description of the dataset is available at https://chongminggao.github.io/KuaiRec/
# If you are running locally, make sure the working directory is the KuaiRec
# root, i.e. the folder that contains the "data/" sub-directory.
# File paths are built by plain string concatenation (rootpath + "data/..."),
# so the value must end with a path separator.
rootpath="./"
If you are using Google Colab, make sure you have added a shortcut to this shared link in your own Google Drive. Then mount your Drive and load the data from it by setting the correct path, as follows.
# from google.colab import drive
# drive.mount('/content/drive')
# rootpath="./drive/MyDrive/Datasets/KuaiRec/" # Make sure this path corresponds to KuaiRec in your Drive.
import ast

import pandas as pd

# Load every KuaiRec table from "<rootpath>data/". All paths are prefixed with
# `rootpath` so that switching between a local checkout and a mounted Google
# Drive only requires changing that one variable.
print("Loading big matrix...")
big_matrix = pd.read_csv(rootpath + "data/big_matrix.csv")
print("Loading small matrix...")
small_matrix = pd.read_csv(rootpath + "data/small_matrix.csv")

# The list-valued columns are stored in the CSV as strings such as "[27, 9]".
# ast.literal_eval only parses Python literals, so (unlike eval) it cannot
# execute code that might be embedded in a downloaded data file.
print("Loading social network...")
social_network = pd.read_csv(rootpath + "data/social_network.csv")
social_network["friend_list"] = social_network["friend_list"].map(ast.literal_eval)
print("Loading item features...")
item_categories = pd.read_csv(rootpath + "data/item_categories.csv")
item_categories["feat"] = item_categories["feat"].map(ast.literal_eval)

print("Loading user features...")
user_features = pd.read_csv(rootpath + "data/user_features.csv")
print("Loading items' daily features...")
item_daily_features = pd.read_csv(rootpath + "data/item_daily_features.csv")
print("All data loaded.")
Loading big matrix... Loading small matrix... Loading social network... Loading item features... Loading user features... Loading items' daily features... All data loaded.
big_matrix
user_id | video_id | play_duration | video_duration | time | date | timestamp | watch_ratio | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 3649 | 13838 | 10867 | 2020-07-05 00:08:23.438 | 20200705 | 1.593879e+09 | 1.273397 |
1 | 0 | 9598 | 13665 | 10984 | 2020-07-05 00:13:41.297 | 20200705 | 1.593879e+09 | 1.244082 |
2 | 0 | 5262 | 851 | 7908 | 2020-07-05 00:16:06.687 | 20200705 | 1.593879e+09 | 0.107613 |
3 | 0 | 1963 | 862 | 9590 | 2020-07-05 00:20:26.792 | 20200705 | 1.593880e+09 | 0.089885 |
4 | 0 | 8234 | 858 | 11000 | 2020-07-05 00:43:05.128 | 20200705 | 1.593881e+09 | 0.078000 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
12530801 | 7175 | 1281 | 34618 | 140017 | 2020-09-05 15:07:10.576 | 20200905 | 1.599290e+09 | 0.247241 |
12530802 | 7175 | 3407 | 12619 | 21888 | 2020-09-05 15:08:45.228 | 20200905 | 1.599290e+09 | 0.576526 |
12530803 | 7175 | 10360 | 2407 | 7067 | 2020-09-05 19:10:29.041 | 20200905 | 1.599304e+09 | 0.340597 |
12530804 | 7175 | 10360 | 6455 | 7067 | 2020-09-05 19:10:36.995 | 20200905 | 1.599304e+09 | 0.913400 |
12530805 | 7175 | 10389 | 12263 | 14304 | 2020-09-05 21:13:51.419 | 20200905 | 1.599312e+09 | 0.857313 |
12530806 rows × 8 columns
small_matrix
user_id | video_id | play_duration | video_duration | time | date | timestamp | watch_ratio | |
---|---|---|---|---|---|---|---|---|
0 | 14 | 148 | 4381 | 6067 | 2020-07-05 05:27:48.378 | 20200705.0 | 1.593898e+09 | 0.722103 |
1 | 14 | 183 | 11635 | 6100 | 2020-07-05 05:28:00.057 | 20200705.0 | 1.593898e+09 | 1.907377 |
2 | 14 | 3649 | 22422 | 10867 | 2020-07-05 05:29:09.479 | 20200705.0 | 1.593898e+09 | 2.063311 |
3 | 14 | 5262 | 4479 | 7908 | 2020-07-05 05:30:43.285 | 20200705.0 | 1.593898e+09 | 0.566388 |
4 | 14 | 8234 | 4602 | 11000 | 2020-07-05 05:35:43.459 | 20200705.0 | 1.593899e+09 | 0.418364 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
4676565 | 7162 | 2267 | 11908 | 5467 | NaN | NaN | NaN | 2.178160 |
4676566 | 7162 | 2065 | 11919 | 6067 | NaN | NaN | NaN | 1.964562 |
4676567 | 7162 | 1296 | 16690 | 19870 | NaN | NaN | NaN | 0.839960 |
4676568 | 7162 | 4822 | 11862 | 24400 | NaN | NaN | NaN | 0.486148 |
4676569 | 7162 | 4364 | 2182 | 19367 | NaN | NaN | NaN | 0.112666 |
4676570 rows × 8 columns
item_categories
video_id | feat | |
---|---|---|
0 | 0 | [8] |
1 | 1 | [27, 9] |
2 | 2 | [9] |
3 | 3 | [26] |
4 | 4 | [5] |
... | ... | ... |
10723 | 10723 | [11] |
10724 | 10724 | [2] |
10725 | 10725 | [15] |
10726 | 10726 | [19] |
10727 | 10727 | [5] |
10728 rows × 2 columns
social_network
user_id | friend_list | |
---|---|---|
0 | 3371 | [2975] |
1 | 24 | [2665] |
2 | 4402 | [38] |
3 | 4295 | [4694] |
4 | 7087 | [7117] |
... | ... | ... |
467 | 2331 | [4345] |
468 | 6163 | [1332] |
469 | 3732 | [670] |
470 | 3335 | [202] |
471 | 5352 | [4202, 7126] |
472 rows × 2 columns
item_daily_features
video_id | date | author_id | video_type | upload_dt | upload_type | visible_status | video_duration | video_width | video_height | ... | download_cnt | download_user_num | report_cnt | report_user_num | reduce_similar_cnt | reduce_similar_user_num | collect_cnt | collect_user_num | cancel_collect_cnt | cancel_collect_user_num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 20200705 | 3309 | NORMAL | 2020-03-30 | ShortImport | public | 5966.0 | 720 | 1280 | ... | 8 | 8 | 0 | 0 | 3 | 3 | NaN | NaN | NaN | NaN |
1 | 0 | 20200706 | 3309 | NORMAL | 2020-03-30 | ShortImport | public | 5966.0 | 720 | 1280 | ... | 2 | 2 | 0 | 0 | 5 | 5 | NaN | NaN | NaN | NaN |
2 | 0 | 20200707 | 3309 | NORMAL | 2020-03-30 | ShortImport | public | 5966.0 | 720 | 1280 | ... | 2 | 2 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN |
3 | 0 | 20200708 | 3309 | NORMAL | 2020-03-30 | ShortImport | public | 5966.0 | 720 | 1280 | ... | 3 | 3 | 0 | 0 | 3 | 3 | NaN | NaN | NaN | NaN |
4 | 0 | 20200709 | 3309 | NORMAL | 2020-03-30 | ShortImport | public | 5966.0 | 720 | 1280 | ... | 2 | 2 | 2 | 1 | 1 | 1 | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
343336 | 10723 | 20200905 | 236 | NORMAL | 2020-09-05 | ShortImport | public | 4833.0 | 720 | 1280 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 |
343337 | 10724 | 20200905 | 5271 | NORMAL | 2020-09-05 | LongImport | public | 54720.0 | 720 | 1280 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 |
343338 | 10725 | 20200905 | 1924 | NORMAL | 2020-09-05 | ShortImport | public | 15800.0 | 576 | 1024 | ... | 5 | 5 | 0 | 0 | 4 | 4 | 0.0 | 0.0 | 0.0 | 0.0 |
343339 | 10726 | 20200905 | 7604 | NORMAL | 2020-09-05 | ShortImport | public | 5132.0 | 528 | 960 | ... | 2 | 2 | 0 | 0 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.0 |
343340 | 10727 | 20200905 | 7464 | NORMAL | 2020-09-05 | ShortCamera | public | 5666.0 | 720 | 1556 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 |
343341 rows × 58 columns
user_features
user_id | user_active_degree | is_lowactive_period | is_live_streamer | is_video_author | follow_user_num | follow_user_num_range | fans_user_num | fans_user_num_range | friend_user_num | ... | onehot_feat8 | onehot_feat9 | onehot_feat10 | onehot_feat11 | onehot_feat12 | onehot_feat13 | onehot_feat14 | onehot_feat15 | onehot_feat16 | onehot_feat17 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | high_active | 0 | 0 | 0 | 5 | (0,10] | 0 | 0 | 0 | ... | 184 | 6 | 3 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 1 | full_active | 0 | 0 | 0 | 386 | (250,500] | 4 | [1,10) | 2 | ... | 186 | 6 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 2 | full_active | 0 | 0 | 0 | 27 | (10,50] | 0 | 0 | 0 | ... | 51 | 2 | 3 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 3 | full_active | 0 | 0 | 0 | 16 | (10,50] | 0 | 0 | 0 | ... | 251 | 3 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 4 | full_active | 0 | 0 | 0 | 122 | (100,150] | 4 | [1,10) | 0 | ... | 99 | 4 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7171 | 7171 | full_active | 0 | 0 | 1 | 52 | (50,100] | 1 | [1,10) | 0 | ... | 259 | 1 | 4 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7172 | 7172 | full_active | 0 | 0 | 0 | 45 | (10,50] | 2 | [1,10) | 2 | ... | 11 | 2 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7173 | 7173 | full_active | 0 | 0 | 0 | 615 | 500+ | 3 | [1,10) | 2 | ... | 51 | 2 | 2 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7174 | 7174 | full_active | 0 | 0 | 0 | 959 | 500+ | 0 | 0 | 0 | ... | 107 | 3 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7175 | 7175 | full_active | 0 | 0 | 1 | 98 | (100,150] | 35 | [10,100) | 33 | ... | 132 | 5 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7176 rows × 31 columns
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Font properties built from a SimHei TTF file expected in the working directory.
# NOTE(review): `myfont` is not referenced by any code visible in this file;
# presumably it is kept for plots with Chinese labels -- confirm before removing.
myfont = FontProperties(fname="./SimHei.ttf")
def visual_continue(df, func=None):
    """Plot the density distribution (histogram plus KDE) of a continuous variable.

    Args:
        df: The values to plot, typically a ``pandas.Series``. Its ``name``
            attribute, when present, is used in the title and x-axis label.
        func: Optional callable invoked with the Axes returned by seaborn,
            for extra customisation before the labels are applied.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # `distplot` has been deprecated since seaborn 0.11. Use its documented
    # replacement when available and fall back only on older releases.
    if hasattr(sns, "histplot"):
        ax = sns.histplot(
            df,
            kde=True,
            stat="density",
            kde_kws={"cut": 3},
            alpha=.4,
            edgecolor=(1, 1, 1, .4),
        )
    else:
        ax = sns.distplot(df)
    if func:
        func(ax)

    # Not every input carries a name (e.g. a plain array or an unnamed
    # Series); use a neutral label instead of raising or printing "None".
    label = getattr(df, "name", None)
    if label is None:
        label = "value"

    gca = plt.gca()
    gca.set_title("Statistics of {}".format(label), fontsize=14)
    gca.set_ylabel("Density", fontsize=14)
    gca.set_xlabel(label, fontsize=14)
    plt.show()
def visual_statistics_discrete(df, var="my_variable", display_ratio=True, func=None, order=None, size=(6, 4.5)):
    """Draw a count plot of a discrete variable, optionally annotated with ratios.

    Args:
        df: Values to count; passed to ``sns.countplot`` as ``x``.
        var: Variable name used in the figure title and x-axis label.
        display_ratio: When true, annotate every bar with its share of
            ``len(df)`` and add a secondary y-axis scaled in percent.
        func: Optional callable invoked with the count-plot Axes.
        order: Category order forwarded to ``sns.countplot``.
        size: Figure size forwarded to ``plt.figure``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    total = len(df)

    figure = plt.figure(figsize=size)
    count_ax = figure.add_axes([0.14, 0.15, 0.74, 0.75])
    sns.countplot(x=df, color="#9fc5e8", linewidth=.6, edgecolor='k', ax=count_ax, order=order)
    plt.grid(axis='y', linestyle='-.')

    current_ax = plt.gca()
    current_ax.set_title(f"Statistics of {var}", fontsize=14)
    current_ax.set_ylabel("Count", fontsize=14)
    current_ax.set_xlabel(var, fontsize=14)

    if func:
        func(count_ax)

    if display_ratio:
        # Secondary axis that expresses the same bar heights as percentages.
        ratio_ax = count_ax.twinx()
        ratio_ax.set_ylabel("ratio (%)", fontsize=14)

        for bar in count_ax.patches:
            corners = bar.get_bbox().get_points()  # [[x0, y0], [x1, y1]]
            xs = corners[:, 0]
            top = corners[1, 1]
            # Centre the percentage label horizontally on top of the bar.
            count_ax.annotate(
                f"{100. * top / total:.1f}%",
                (xs.mean(), top),
                ha='center', va='bottom', fontsize=10, rotation=30,
            )

        # Keep both y-axes aligned: the count-axis upper limit mapped to percent.
        upper = count_ax.get_ylim()[1]
        ratio_ax.set_ylim(0, upper / total * 100)

    plt.show()
import warnings; warnings.simplefilter('ignore')
# Distribution of the number of friends per user listed in the social network.
friend_counts = social_network["friend_list"].map(len)
print(friend_counts.describe())
visual_statistics_discrete(friend_counts, "number of friends")
count 472.000000 mean 1.419492 std 0.853295 min 1.000000 25% 1.000000 50% 1.000000 75% 2.000000 max 5.000000 Name: friend_list, dtype: float64
# Distribution of the number of tags attached to each video.
num_feat = item_categories["feat"].map(len)
print(num_feat.describe())
visual_statistics_discrete(num_feat, var="number of tags")
count 10728.000000 mean 1.183166 std 0.436205 min 1.000000 25% 1.000000 50% 1.000000 75% 1.000000 max 4.000000 Name: feat, dtype: float64
import collections
import itertools
# One Counter per video, holding that video's tags.
cnt = item_categories.feat.map(collections.Counter)
# Overall tag frequencies, accumulated across all videos.
cnt_all = collections.Counter(itertools.chain.from_iterable(item_categories.feat))
# Expand the frequencies back into a sorted Series with one entry per tag
# occurrence, which is the shape the count plot expects.
tag_runs = ([tag] * times for tag, times in cnt_all.items())
all_feat = pd.Series(sorted(itertools.chain.from_iterable(tag_runs)), name="feat")
visual_statistics_discrete(all_feat, "tag", size=(12, 4.5))
# Watch-ratio distribution in the big matrix, restricted to ratios of at most 5
# so that larger values do not stretch the plot.
big_ratio_all = big_matrix["watch_ratio"]
big_watch_ratio = big_ratio_all[big_ratio_all <= 5]
print(big_watch_ratio.describe())
visual_continue(big_watch_ratio)
count 1.241552e+07 mean 8.510664e-01 std 7.185296e-01 min 0.000000e+00 25% 3.111008e-01 50% 7.161066e-01 75% 1.161843e+00 max 5.000000e+00 Name: watch_ratio, dtype: float64
# Watch-ratio distribution in the small matrix, restricted to ratios of at most 5
# so that larger values do not stretch the plot.
small_ratio_all = small_matrix["watch_ratio"]
small_watch_ratio = small_ratio_all[small_ratio_all <= 5]
print(small_watch_ratio.describe())
visual_continue(small_watch_ratio)
count 4.653780e+06 mean 8.553870e-01 std 5.952060e-01 min 0.000000e+00 25% 4.657859e-01 50% 7.662410e-01 75% 1.114060e+00 max 5.000000e+00 Name: watch_ratio, dtype: float64
# Video-duration distribution over all interactions in the big matrix.
big_video_duration = big_matrix["video_duration"]
print(big_video_duration.describe())
# The summary covers every row, but the plot keeps only durations below 100000
# (in the dataset's own unit) so the long tail does not dominate the x-axis.
big_duration_mask = big_video_duration < 100000
visual_continue(big_video_duration[big_duration_mask])
count 1.253081e+07 mean 1.462157e+04 std 1.983474e+04 min 1.400000e+02 25% 7.434000e+03 50% 9.636000e+03 75% 1.217900e+04 max 3.150720e+05 Name: video_duration, dtype: float64
# Video-duration distribution over all interactions in the small matrix.
small_video_duration = small_matrix["video_duration"]
print(small_video_duration.describe())
# The summary covers every row, but the plot keeps only durations below 100000
# (in the dataset's own unit) so the long tail does not dominate the x-axis.
small_duration_mask = small_video_duration < 100000
visual_continue(small_video_duration[small_duration_mask])
count 4.676570e+06 mean 1.448645e+04 std 2.046711e+04 min 3.067000e+03 25% 7.523000e+03 50% 9.600000e+03 75% 1.193400e+04 max 3.150720e+05 Name: video_duration, dtype: float64
# Number of interactions per user in the big matrix.
# `size()` counts the rows of each group and returns a Series, so the label
# below is a real Series name rather than an ad-hoc attribute set on a
# DataFrame (which pandas warns about and drops on copy).
big_play_time = big_matrix.groupby('user_id').size()
big_play_time.name = "play times"
print(big_play_time.describe())
visual_continue(big_play_time)
date count 7176.000000 mean 1746.210424 std 991.832222 min 100.000000 25% 883.000000 50% 1846.500000 75% 2461.000000 max 16015.000000
# Number of interactions per user in the small matrix.
# `size()` counts the rows of each group and returns a Series, so the label
# below is a real Series name rather than an ad-hoc attribute set on a
# DataFrame (which pandas warns about and drops on copy).
small_play_time = small_matrix.groupby('user_id').size()
small_play_time.name = "play times"
print(small_play_time.describe())
visual_continue(small_play_time)
date count 1411.000000 mean 3314.365698 std 6.984852 min 3295.000000 25% 3309.000000 50% 3315.000000 75% 3320.000000 max 3327.000000
# Number of interactions per (user, day) pair in the big matrix.
big_user_day_groups = big_matrix.groupby(['user_id', 'date'])
big_daily_play_time = big_user_day_groups.size()
big_daily_play_time.name = "play times"
print(big_daily_play_time.describe())
visual_continue(big_daily_play_time)
count 188322.000000 mean 66.539257 std 78.752240 min 1.000000 25% 19.000000 50% 42.000000 75% 83.000000 max 3268.000000 Name: play times, dtype: float64
# Number of interactions per (user, day) pair in the small matrix.
# NOTE(review): some small-matrix rows have a NaN date; presumably they are
# left out of these groups by groupby's default NaN-key handling -- confirm.
small_user_day_groups = small_matrix.groupby(['user_id', 'date'])
small_daily_play_time = small_user_day_groups.size()
small_daily_play_time.name = "play times"
print(small_daily_play_time.describe())
visual_continue(small_daily_play_time)
count 86671.000000 mean 51.857922 std 32.608372 min 1.000000 25% 28.000000 50% 47.000000 75% 70.000000 max 402.000000 Name: play times, dtype: float64