Build error
Commit 4871b85
1 Parent(s): f1f0527
Reorganize data
This view is limited to 50 files because it contains too many changes. See raw diff.
- data.py +138 -113
- game.csv → data/2024/game.csv +0 -0
- {pa → data/2024/pa}/2021019999.csv +0 -0
- {pa → data/2024/pa}/2021020000.csv +0 -0
- {pa → data/2024/pa}/2021020001.csv +0 -0
- {pa → data/2024/pa}/2021020002.csv +0 -0
- {pa → data/2024/pa}/2021020003.csv +0 -0
- {pa → data/2024/pa}/2021020004.csv +0 -0
- {pa → data/2024/pa}/2021020005.csv +0 -0
- {pa → data/2024/pa}/2021020006.csv +0 -0
- {pa → data/2024/pa}/2021020007.csv +0 -0
- {pa → data/2024/pa}/2021020008.csv +0 -0
- {pa → data/2024/pa}/2021020009.csv +0 -0
- {pa → data/2024/pa}/2021020010.csv +0 -0
- {pa → data/2024/pa}/2021020011.csv +0 -0
- {pa → data/2024/pa}/2021020012.csv +0 -0
- {pa → data/2024/pa}/2021020013.csv +0 -0
- {pa → data/2024/pa}/2021020014.csv +0 -0
- {pa → data/2024/pa}/2021020015.csv +0 -0
- {pa → data/2024/pa}/2021020016.csv +0 -0
- {pa → data/2024/pa}/2021020017.csv +0 -0
- {pa → data/2024/pa}/2021020018.csv +0 -0
- {pa → data/2024/pa}/2021020019.csv +0 -0
- {pa → data/2024/pa}/2021020020.csv +0 -0
- {pa → data/2024/pa}/2021020021.csv +0 -0
- {pa → data/2024/pa}/2021020022.csv +0 -0
- {pa → data/2024/pa}/2021020023.csv +0 -0
- {pa → data/2024/pa}/2021020024.csv +0 -0
- {pa → data/2024/pa}/2021020026.csv +0 -0
- {pa → data/2024/pa}/2021020027.csv +0 -0
- {pa → data/2024/pa}/2021020028.csv +0 -0
- {pa → data/2024/pa}/2021020029.csv +0 -0
- {pa → data/2024/pa}/2021020030.csv +0 -0
- {pa → data/2024/pa}/2021020031.csv +0 -0
- {pa → data/2024/pa}/2021020032.csv +0 -0
- {pa → data/2024/pa}/2021020033.csv +0 -0
- {pa → data/2024/pa}/2021020034.csv +0 -0
- {pa → data/2024/pa}/2021020035.csv +0 -0
- {pa → data/2024/pa}/2021020036.csv +0 -0
- {pa → data/2024/pa}/2021020037.csv +0 -0
- {pa → data/2024/pa}/2021020038.csv +0 -0
- {pa → data/2024/pa}/2021020039.csv +0 -0
- {pa → data/2024/pa}/2021020040.csv +0 -0
- {pa → data/2024/pa}/2021020041.csv +0 -0
- {pa → data/2024/pa}/2021020042.csv +0 -0
- {pa → data/2024/pa}/2021020043.csv +0 -0
- {pa → data/2024/pa}/2021020044.csv +0 -0
- {pa → data/2024/pa}/2021020045.csv +0 -0
- {pa → data/2024/pa}/2021020046.csv +0 -0
- {pa → data/2024/pa}/2021020047.csv +0 -0
data.py
CHANGED
@@ -14,26 +14,6 @@ from translate import (
     max_pitch_types
 )
 
-# load game data
-game_df = pl.read_csv('game.csv').unique()
-assert len(game_df) == len(game_df['game_pk'].unique())
-
-# load pa data
-pa_df = []
-for game_pk in tqdm(game_df['game_pk']):
-    pa_df.append(pl.read_csv(os.path.join('pa', f'{game_pk}.csv'), schema_overrides={'pa_pk': str}))
-pa_df = pl.concat(pa_df)
-
-# load pitch data
-pitch_df = []
-for game_pk in tqdm(game_df['game_pk']):
-    pitch_df.append(pl.read_csv(os.path.join('pitch', f'{game_pk}.csv'), schema_overrides={'pa_pk': str, 'on_1b': pl.Int64, 'on_2b': pl.Int64, 'on_3b': pl.Int64}))
-pitch_df = pl.concat(pitch_df)
-
-# load player data
-player_df = pl.read_csv('player.csv')
-
-# translate pa data
 
 def identify_bb_type(hit_type):
     if hit_type in list(range(1, 10)) + list(range(40, 49)):
@@ -49,115 +29,160 @@ def identify_bb_type(hit_type):
     else:
         raise Exception(f'Unexpect hit_type {hit_type}')
 
-pa_df = (
-    pa_df
-    .with_columns(
-        pl.col('des').str.strip_chars().alias('_des'),
-        pl.col('des').str.strip_chars(),
-        pl.col('des_more').str.strip_chars()
-    )
-    .with_columns(
-        pl.col('des').fill_null(pl.col('des_more'))
-    )
-    .with_columns(
-        pl.when(
-            (pl.col('des').str.split(' ').list.len() > 1) &
-            (pl.col('des').str.contains(r'（\d+点'))
-        )
-        .then(pl.col('des').str.split(' ').list.first())
-        .otherwise(pl.col('des'))
-        .alias('des')
-    )
-    .with_columns(
-        pl.when(
-            pl.col('des').is_in(['ボール', '見逃し', '空振り']) |
-            pl.col('des').str.ends_with('ε‘γγεΆ')
-        )
-        .then(
-            pl.col('des_more')
-        )
-        .otherwise(
-            pl.col('des')
-        )
-        .alias('des')
-    )
-    .with_columns(
-        pl.col('des').map_elements(translate_pa_outcome, return_dtype=str)
-    )
-    .with_columns(
-        pl.col('bb_type').alias('hit_type').str.strip_prefix('dakyu').cast(int).alias('hit_type')
-    )
-    .with_columns(
-        pl.col('hit_type').map_elements(lambda hit_type: identify_bb_type(hit_type), return_dtype=str).alias('bb_type')
-    )
-)
+
+DATA_DIR = 'data'
+SEASONS = sorted([folder for folder in os.listdir(DATA_DIR) if not folder.startswith('.')])
+
+game_df, pa_df, pitch_df, player_df, df = [], [], [], [], []
+
+for season in SEASONS:
+    season_dir = os.path.join(DATA_DIR, season)
+
+    # load game data
+    _game_df = pl.read_csv(os.path.join(season_dir, 'game.csv')).unique()
+    assert len(_game_df) == len(_game_df['game_pk'].unique())
+
+    # load pa data
+    _pa_df = []
+    for game_pk in tqdm(_game_df['game_pk']):
+        _pa_df.append(pl.read_csv(os.path.join(season_dir, 'pa', f'{game_pk}.csv'), schema_overrides={'pa_pk': str}))
+    _pa_df = pl.concat(_pa_df)
+
+    # load pitch data
+    _pitch_df = []
+    for game_pk in tqdm(_game_df['game_pk']):
+        _pitch_df.append(pl.read_csv(os.path.join(season_dir, 'pitch', f'{game_pk}.csv'), schema_overrides={'pa_pk': str, 'on_1b': pl.Int64, 'on_2b': pl.Int64, 'on_3b': pl.Int64}))
+    _pitch_df = pl.concat(_pitch_df)
+
+    # load player data
+    _player_df = pl.read_csv(os.path.join(season_dir, 'player.csv'))
+
+    # translate pa data
+    _pa_df = (
+        _pa_df
+        .with_columns(
+            pl.col('des').str.strip_chars().alias('_des'),
+            pl.col('des').str.strip_chars(),
+            pl.col('des_more').str.strip_chars()
+        )
+        .with_columns(
+            pl.col('des').fill_null(pl.col('des_more'))
+        )
+        .with_columns(
+            pl.when(
+                (pl.col('des').str.split(' ').list.len() > 1) &
+                (pl.col('des').str.contains(r'（\d+点'))
+            )
+            .then(pl.col('des').str.split(' ').list.first())
+            .otherwise(pl.col('des'))
+            .alias('des')
+        )
+        .with_columns(
+            pl.when(
+                pl.col('des').is_in(['ボール', '見逃し', '空振り']) |
+                pl.col('des').str.ends_with('ε‘γγεΆ')
+            )
+            .then(
+                pl.col('des_more')
+            )
+            .otherwise(
+                pl.col('des')
+            )
+            .alias('des')
+        )
+        .with_columns(
+            pl.col('des').map_elements(translate_pa_outcome, return_dtype=str)
+        )
+        .with_columns(
+            pl.col('bb_type').alias('hit_type').str.strip_prefix('dakyu').cast(int).alias('hit_type')
+        )
+        .with_columns(
+            pl.col('hit_type').map_elements(lambda hit_type: identify_bb_type(hit_type), return_dtype=str).alias('bb_type')
+        )
+    )
 
-# translate pitch data
-pitch_df = (
-    pitch_df
-    .filter(pl.col('pitch_name').is_not_null())
-    .with_columns(
-        pl.col('pitch_name').alias('jp_pitch_name')
-    )
-    .with_columns(
-        pl.col('jp_pitch_name').map_elements(lambda pitch_name: jp_pitch_to_en_pitch[pitch_name], return_dtype=str).alias('pitch_name'),
-        # pl.col('jp_pitch_name').replace_strict(jp_pitch_to_en_pitch).alias('pitch_name'),
-        pl.col('jp_pitch_name').map_elements(lambda pitch_name: jp_pitch_to_pitch_code[pitch_name], return_dtype=str).alias('pitch_type'),
-        # pl.col('jp_pitch_name').map_elements(jp_pitch_to_pitch_code).alias('pitch_type'),
-        pl.col('description').str.split(' ').list.first().map_elements(translate_pitch_outcome, return_dtype=str),
-        pl.when(
-            pl.col('release_speed') != '-'
-        )
-        .then(
-            pl.col('release_speed').str.strip_suffix('km/h')
-        )
-        .otherwise(
-            None
-        )
-        .alias('release_speed'),
-        ((pl.col('plate_x') + 13) - 80).alias('plate_x'),
-        (200 - (pl.col('plate_z') + 13) - 100).alias('plate_z'),
-    )
-    .with_columns(
-        pl.col('release_speed').cast(int), # idk why I can't do this during the strip_suffix step
-    )
-)
+    # translate pitch data
+    _pitch_df = (
+        _pitch_df
+        .filter(pl.col('pitch_name').is_not_null())
+        .with_columns(
+            pl.col('pitch_name').alias('jp_pitch_name')
+        )
+        .with_columns(
+            pl.col('jp_pitch_name').map_elements(lambda pitch_name: jp_pitch_to_en_pitch[pitch_name], return_dtype=str).alias('pitch_name'),
+            # pl.col('jp_pitch_name').replace_strict(jp_pitch_to_en_pitch).alias('pitch_name'),
+            pl.col('jp_pitch_name').map_elements(lambda pitch_name: jp_pitch_to_pitch_code[pitch_name], return_dtype=str).alias('pitch_type'),
+            # pl.col('jp_pitch_name').map_elements(jp_pitch_to_pitch_code).alias('pitch_type'),
+            pl.col('description').str.split(' ').list.first().map_elements(translate_pitch_outcome, return_dtype=str),
+            pl.when(
+                pl.col('release_speed') != '-'
+            )
+            .then(
+                pl.col('release_speed').str.strip_suffix('km/h')
+            )
+            .otherwise(
+                None
+            )
+            .alias('release_speed'),
+            ((pl.col('plate_x') + 13) - 80).alias('plate_x'),
+            (200 - (pl.col('plate_z') + 13) - 100).alias('plate_z'),
+        )
+        .with_columns(
+            pl.col('release_speed').cast(int), # idk why I can't do this during the strip_suffix step
+        )
+    )
 
-# translate player data
-
-register = (
-    pl.read_csv('register.csv')
-    .with_columns(
-        pl.col('en_name').str.replace(',', '').alias('en_name'),
+    # translate player data
+    register = (
+        pl.read_csv(os.path.join(season_dir, 'register.csv'))
+        .with_columns(
+            pl.col('en_name').str.replace(',', '').alias('en_name'),
 
-    )
-    .select(
-        pl.col('en_name'),
-        pl.col('jp_team').alias('team'),
-        pl.col('jp_name').alias('name')
-    )
-)
-player_df = player_df.join(register, on=['name', 'team'], how='inner').with_columns(pl.col('en_name').alias('name')).drop(pl.col('en_name'))
+        )
+        .select(
+            pl.col('en_name'),
+            pl.col('jp_team').alias('team'),
+            pl.col('jp_name').alias('name')
+        )
+    )
+    _player_df = _player_df.join(register, on=['name', 'team'], how='inner').with_columns(pl.col('en_name').alias('name')).drop(pl.col('en_name'))
+
+    # merge pitch and pa data
+    _df = (
+        (
+            _pitch_df
+            .join(_pa_df, on=['game_pk', 'pa_pk'], how='inner')
+            .join(_player_df.rename({'player_id': 'pitcher'}), on='pitcher', how='inner')
+        )
+        .with_columns(
+            pl.col('description').is_in(['SS', 'K']).alias('whiff'),
+            ~pl.col('description').is_in(['B', 'BB', 'LS', 'inv_K', 'bunt_K', 'HBP', 'SH', 'SH E', 'SH FC', 'obstruction', 'illegal_pitch', 'defensive_interference']).alias('swing'),
+            pl.col('description').is_in(['SS', 'K', 'LS', 'inv_K']).alias('csw'),
+            ~pl.col('description').is_in(['obstruction', 'illegal_pitch', 'defensive_interference']).alias('normal_pitch') # guess
+        )
+    ).sort(['game_pk', 'pa_pk', 'pitch_id'])
 
-# merge pitch and pa data
-
-df = (
-    (
-        pitch_df
-        .join(pa_df, on=['game_pk', 'pa_pk'], how='inner')
-        .join(player_df.rename({'player_id': 'pitcher'}), on='pitcher', how='inner')
-    )
-    .with_columns(
-        pl.col('description').is_in(['SS', 'K']).alias('whiff'),
-        ~pl.col('description').is_in(['B', 'BB', 'LS', 'inv_K', 'bunt_K', 'HBP', 'SH', 'SH E', 'SH FC', 'obstruction', 'illegal_pitch', 'defensive_interference']).alias('swing'),
-        pl.col('description').is_in(['SS', 'K', 'LS', 'inv_K']).alias('csw'),
-        ~pl.col('description').is_in(['obstruction', 'illegal_pitch', 'defensive_interference']).alias('normal_pitch') # guess
-    )
-).sort(['game_pk', 'pa_pk', 'pitch_id'])
+    # add players to pa_df
+    # unfortunately we have pas that don't show up in the pitch data, so this would be useful for
+    _pa_df = _pa_df.join(_player_df.rename({'player_id': 'pitcher'}), on='pitcher', how='inner')
 
-# add players to pa_df
-# unfortunately we have pas that don't show up in the pitch data, so this would be useful for
-pa_df = pa_df.join(player_df.rename({'player_id': 'pitcher'}), on='pitcher', how='inner')
+    # add season dfs to main dfs
+    game_df.append(_game_df)
+    pa_df.append(_pa_df)
+    pitch_df.append(_pitch_df)
+    player_df.append(_player_df)
+    df.append(_df)
+
+
+
+# combine all season dfs
+game_df = pl.concat(game_df)
+pa_df = pl.concat(pa_df)
+pitch_df = pl.concat(pitch_df)
+player_df = pl.concat(player_df).unique()
+df = pl.concat(df)
 
+assert len(_game_df) == len(_game_df['game_pk'].unique())
+
 # pitch_stats, rhb_pitch_stats, lhb_pitch_stats = [
 # (
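For orientation, the directory layout that the reorganized loader above expects is sketched below. Only the 2024 season exists in this commit (the renames that follow move game.csv and pa/ under data/2024/); the pitch/, player.csv, and register.csv locations are inferred from the read_csv calls in data.py, and any additional season folders are hypothetical.

import os

# Expected layout after this commit (a sketch, not part of the repo):
#
#   data/
#     2024/
#       game.csv             - one row per game (game_pk)
#       pa/<game_pk>.csv     - plate-appearance rows for one game
#       pitch/<game_pk>.csv  - pitch-level rows for one game
#       player.csv           - player ids with Japanese names and teams
#       register.csv         - English-name register joined onto player.csv
#
# data.py discovers seasons by listing data/, so a new season is added by
# dropping another folder with the same structure next to 2024/.

DATA_DIR = 'data'

for season in sorted(os.listdir(DATA_DIR)):
    season_dir = os.path.join(DATA_DIR, season)
    if season.startswith('.') or not os.path.isdir(season_dir):
        continue
    print(season, sorted(os.listdir(season_dir)))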
game.csv → data/2024/game.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021019999.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020000.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020001.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020002.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020003.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020004.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020005.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020006.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020007.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020008.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020009.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020010.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020011.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020012.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020013.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020014.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020015.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020016.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020017.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020018.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020019.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020020.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020021.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020022.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020023.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020024.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020026.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020027.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020028.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020029.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020030.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020031.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020032.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020033.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020034.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020035.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020036.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020037.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020038.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020039.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020040.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020041.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020042.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020043.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020044.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020045.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020046.csv    RENAMED    File without changes
{pa → data/2024/pa}/2021020047.csv    RENAMED    File without changes
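As a usage note: the whiff, swing, and csw flags that the merge step in data.py adds to df feed the commented-out pitch_stats block at the end of the file. A minimal sketch of how they might be aggregated per pitch type follows; the grouping and rate definitions here are assumptions for illustration, not code from this commit.

import polars as pl

# Assumes `df` as built by data.py above; column names come from the diff,
# the rate definitions are illustrative.
pitch_rates = (
    df
    .group_by('pitch_name')
    .agg(
        pl.len().alias('pitches'),
        pl.col('swing').mean().alias('swing_rate'),
        (pl.col('whiff').sum() / pl.col('swing').sum()).alias('whiff_per_swing'),
        pl.col('csw').mean().alias('csw_rate'),
    )
    .sort('pitches', descending=True)
)
print(pitch_rates)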