patrickramos committed on
Commit
4871b85
·
1 Parent(s): f1f0527

Reorganize data

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. data.py +138 -113
  2. game.csv → data/2024/game.csv +0 -0
  3. {pa → data/2024/pa}/2021019999.csv +0 -0
  4. {pa → data/2024/pa}/2021020000.csv +0 -0
  5. {pa → data/2024/pa}/2021020001.csv +0 -0
  6. {pa → data/2024/pa}/2021020002.csv +0 -0
  7. {pa → data/2024/pa}/2021020003.csv +0 -0
  8. {pa → data/2024/pa}/2021020004.csv +0 -0
  9. {pa → data/2024/pa}/2021020005.csv +0 -0
  10. {pa → data/2024/pa}/2021020006.csv +0 -0
  11. {pa → data/2024/pa}/2021020007.csv +0 -0
  12. {pa → data/2024/pa}/2021020008.csv +0 -0
  13. {pa → data/2024/pa}/2021020009.csv +0 -0
  14. {pa → data/2024/pa}/2021020010.csv +0 -0
  15. {pa → data/2024/pa}/2021020011.csv +0 -0
  16. {pa → data/2024/pa}/2021020012.csv +0 -0
  17. {pa → data/2024/pa}/2021020013.csv +0 -0
  18. {pa → data/2024/pa}/2021020014.csv +0 -0
  19. {pa → data/2024/pa}/2021020015.csv +0 -0
  20. {pa → data/2024/pa}/2021020016.csv +0 -0
  21. {pa → data/2024/pa}/2021020017.csv +0 -0
  22. {pa → data/2024/pa}/2021020018.csv +0 -0
  23. {pa → data/2024/pa}/2021020019.csv +0 -0
  24. {pa → data/2024/pa}/2021020020.csv +0 -0
  25. {pa → data/2024/pa}/2021020021.csv +0 -0
  26. {pa → data/2024/pa}/2021020022.csv +0 -0
  27. {pa → data/2024/pa}/2021020023.csv +0 -0
  28. {pa → data/2024/pa}/2021020024.csv +0 -0
  29. {pa → data/2024/pa}/2021020026.csv +0 -0
  30. {pa → data/2024/pa}/2021020027.csv +0 -0
  31. {pa → data/2024/pa}/2021020028.csv +0 -0
  32. {pa → data/2024/pa}/2021020029.csv +0 -0
  33. {pa → data/2024/pa}/2021020030.csv +0 -0
  34. {pa → data/2024/pa}/2021020031.csv +0 -0
  35. {pa → data/2024/pa}/2021020032.csv +0 -0
  36. {pa → data/2024/pa}/2021020033.csv +0 -0
  37. {pa → data/2024/pa}/2021020034.csv +0 -0
  38. {pa → data/2024/pa}/2021020035.csv +0 -0
  39. {pa → data/2024/pa}/2021020036.csv +0 -0
  40. {pa → data/2024/pa}/2021020037.csv +0 -0
  41. {pa → data/2024/pa}/2021020038.csv +0 -0
  42. {pa → data/2024/pa}/2021020039.csv +0 -0
  43. {pa → data/2024/pa}/2021020040.csv +0 -0
  44. {pa → data/2024/pa}/2021020041.csv +0 -0
  45. {pa → data/2024/pa}/2021020042.csv +0 -0
  46. {pa → data/2024/pa}/2021020043.csv +0 -0
  47. {pa → data/2024/pa}/2021020044.csv +0 -0
  48. {pa → data/2024/pa}/2021020045.csv +0 -0
  49. {pa → data/2024/pa}/2021020046.csv +0 -0
  50. {pa → data/2024/pa}/2021020047.csv +0 -0
data.py CHANGED
@@ -14,26 +14,6 @@ from translate import (
14
  max_pitch_types
15
  )
16
 
17
- # load game data
18
- game_df = pl.read_csv('game.csv').unique()
19
- assert len(game_df) == len(game_df['game_pk'].unique())
20
-
21
- # load pa data
22
- pa_df = []
23
- for game_pk in tqdm(game_df['game_pk']):
24
- pa_df.append(pl.read_csv(os.path.join('pa', f'{game_pk}.csv'), schema_overrides={'pa_pk': str}))
25
- pa_df = pl.concat(pa_df)
26
-
27
- # load pitch data
28
- pitch_df = []
29
- for game_pk in tqdm(game_df['game_pk']):
30
- pitch_df.append(pl.read_csv(os.path.join('pitch', f'{game_pk}.csv'), schema_overrides={'pa_pk': str, 'on_1b': pl.Int64, 'on_2b': pl.Int64, 'on_3b': pl.Int64}))
31
- pitch_df = pl.concat(pitch_df)
32
-
33
- # load player data
34
- player_df = pl.read_csv('player.csv')
35
-
36
- # translate pa data
37
 
38
  def identify_bb_type(hit_type):
39
  if hit_type in list(range(1, 10)) + list(range(40, 49)):
@@ -49,115 +29,160 @@ def identify_bb_type(hit_type):
49
  else:
50
  raise Exception(f'Unexpect hit_type {hit_type}')
51
 
52
- pa_df = (
53
- pa_df
54
- .with_columns(
55
- pl.col('des').str.strip_chars().alias('_des'),
56
- pl.col('des').str.strip_chars(),
57
- pl.col('des_more').str.strip_chars()
58
- )
59
- .with_columns(
60
- pl.col('des').fill_null(pl.col('des_more'))
61
- )
62
- .with_columns(
63
- pl.when(
64
- (pl.col('des').str.split(' ').list.len() > 1) &
65
- (pl.col('des').str.contains(r'＋\d+点'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  )
67
- .then(pl.col('des').str.split(' ').list.first())
68
- .otherwise(pl.col('des'))
69
- .alias('des')
70
- )
71
- .with_columns(
72
- pl.when(
73
- pl.col('des').is_in(['ボール', '見逃し', '空振り']) |
74
- pl.col('des').str.ends_with('ε‘γ‘γ‚“εˆΆ')
75
  )
76
- .then(
77
- pl.col('des_more')
 
 
 
 
 
 
78
  )
79
- .otherwise(
80
- pl.col('des')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  )
82
- .alias('des')
83
- )
84
- .with_columns(
85
- pl.col('des').map_elements(translate_pa_outcome, return_dtype=str)
86
- )
87
- .with_columns(
88
- pl.col('bb_type').alias('hit_type').str.strip_prefix('dakyu').cast(int).alias('hit_type')
89
- )
90
- .with_columns(
91
- pl.col('hit_type').map_elements(lambda hit_type: identify_bb_type(hit_type), return_dtype=str).alias('bb_type')
92
  )
93
- )
94
 
95
- # translate pitch data
96
- pitch_df = (
97
- pitch_df
98
- .filter(pl.col('pitch_name').is_not_null())
99
- .with_columns(
100
- pl.col('pitch_name').alias('jp_pitch_name')
101
- )
102
- .with_columns(
103
- pl.col('jp_pitch_name').map_elements(lambda pitch_name: jp_pitch_to_en_pitch[pitch_name], return_dtype=str).alias('pitch_name'),
104
- # pl.col('jp_pitch_name').replace_strict(jp_pitch_to_en_pitch).alias('pitch_name'),
105
- pl.col('jp_pitch_name').map_elements(lambda pitch_name: jp_pitch_to_pitch_code[pitch_name], return_dtype=str).alias('pitch_type'),
106
- # pl.col('jp_pitch_name').map_elements(jp_pitch_to_pitch_code).alias('pitch_type'),
107
- pl.col('description').str.split(' ').list.first().map_elements(translate_pitch_outcome, return_dtype=str),
108
- pl.when(
109
- pl.col('release_speed') != '-'
110
  )
111
- .then(
112
- pl.col('release_speed').str.strip_suffix('km/h')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  )
114
- .otherwise(
115
- None
116
  )
117
- .alias('release_speed'),
118
- ((pl.col('plate_x') + 13) - 80).alias('plate_x'),
119
- (200 - (pl.col('plate_z') + 13) - 100).alias('plate_z'),
120
  )
121
- .with_columns(
122
- pl.col('release_speed').cast(int), # idk why I can't do this during the strip_suffix step
123
- )
124
- )
125
 
126
- # translate player data
127
- player_df = pl.read_csv('player.csv')
128
- register = (
129
- pl.read_csv('register.csv')
130
- .with_columns(
131
- pl.col('en_name').str.replace(',', '').alias('en_name'),
132
 
 
 
 
 
 
 
133
  )
134
- .select(
135
- pl.col('en_name'),
136
- pl.col('jp_team').alias('team'),
137
- pl.col('jp_name').alias('name')
138
- )
139
- )
140
- player_df = player_df.join(register, on=['name', 'team'], how='inner').with_columns(pl.col('en_name').alias('name')).drop(pl.col('en_name'))
 
 
 
 
 
 
 
 
 
141
 
142
- # merge pitch and pa data
 
 
143
 
144
- df = (
145
- (
146
- pitch_df
147
- .join(pa_df, on=['game_pk', 'pa_pk'], how='inner')
148
- .join(player_df.rename({'player_id': 'pitcher'}), on='pitcher', how='inner')
149
- )
150
- .with_columns(
151
- pl.col('description').is_in(['SS', 'K']).alias('whiff'),
152
- ~pl.col('description').is_in(['B', 'BB', 'LS', 'inv_K', 'bunt_K', 'HBP', 'SH', 'SH E', 'SH FC', 'obstruction', 'illegal_pitch', 'defensive_interference']).alias('swing'),
153
- pl.col('description').is_in(['SS', 'K', 'LS', 'inv_K']).alias('csw'),
154
- ~pl.col('description').is_in(['obstruction', 'illegal_pitch', 'defensive_interference']).alias('normal_pitch') # guess
155
- )
156
- ).sort(['game_pk', 'pa_pk', 'pitch_id'])
 
 
157
 
158
- # add players to pa_df
159
- # unfortunately we have pas that don't show up in the pitch data, so this would be useful for
160
- pa_df = pa_df.join(player_df.rename({'player_id': 'pitcher'}), on='pitcher', how='inner')
161
 
162
  # pitch_stats, rhb_pitch_stats, lhb_pitch_stats = [
163
  # (
 
14
  max_pitch_types
15
  )
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def identify_bb_type(hit_type):
19
  if hit_type in list(range(1, 10)) + list(range(40, 49)):
 
29
  else:
30
  raise Exception(f'Unexpect hit_type {hit_type}')
31
 
32
+
33
+ DATA_DIR = 'data'
34
+ SEASONS = sorted([folder for folder in os.listdir(DATA_DIR) if not folder.startswith('.')])
35
+
36
+ game_df, pa_df, pitch_df, player_df, df = [], [], [], [], []
37
+
38
+ for season in SEASONS:
39
+ season_dir = os.path.join(DATA_DIR, season)
40
+
41
+ # load game data
42
+ _game_df = pl.read_csv(os.path.join(season_dir, 'game.csv')).unique()
43
+ assert len(_game_df) == len(_game_df['game_pk'].unique())
44
+
45
+ # load pa data
46
+ _pa_df = []
47
+ for game_pk in tqdm(_game_df['game_pk']):
48
+ _pa_df.append(pl.read_csv(os.path.join(season_dir, 'pa', f'{game_pk}.csv'), schema_overrides={'pa_pk': str}))
49
+ _pa_df = pl.concat(_pa_df)
50
+
51
+ # load pitch data
52
+ _pitch_df = []
53
+ for game_pk in tqdm(_game_df['game_pk']):
54
+ _pitch_df.append(pl.read_csv(os.path.join(season_dir, 'pitch', f'{game_pk}.csv'), schema_overrides={'pa_pk': str, 'on_1b': pl.Int64, 'on_2b': pl.Int64, 'on_3b': pl.Int64}))
55
+ _pitch_df = pl.concat(_pitch_df)
56
+
57
+ # load player data
58
+ _player_df = pl.read_csv(os.path.join(season_dir, 'player.csv'))
59
+
60
+ # translate pa data
61
+ _pa_df = (
62
+ _pa_df
63
+ .with_columns(
64
+ pl.col('des').str.strip_chars().alias('_des'),
65
+ pl.col('des').str.strip_chars(),
66
+ pl.col('des_more').str.strip_chars()
67
  )
68
+ .with_columns(
69
+ pl.col('des').fill_null(pl.col('des_more'))
 
 
 
 
 
 
70
  )
71
+ .with_columns(
72
+ pl.when(
73
+ (pl.col('des').str.split(' ').list.len() > 1) &
74
+ (pl.col('des').str.contains(r'＋\d+点'))
75
+ )
76
+ .then(pl.col('des').str.split(' ').list.first())
77
+ .otherwise(pl.col('des'))
78
+ .alias('des')
79
  )
80
+ .with_columns(
81
+ pl.when(
82
+ pl.col('des').is_in(['ボール', '見逃し', '空振り']) |
83
+ pl.col('des').str.ends_with('ε‘γ‘γ‚“εˆΆ')
84
+ )
85
+ .then(
86
+ pl.col('des_more')
87
+ )
88
+ .otherwise(
89
+ pl.col('des')
90
+ )
91
+ .alias('des')
92
+ )
93
+ .with_columns(
94
+ pl.col('des').map_elements(translate_pa_outcome, return_dtype=str)
95
+ )
96
+ .with_columns(
97
+ pl.col('bb_type').alias('hit_type').str.strip_prefix('dakyu').cast(int).alias('hit_type')
98
+ )
99
+ .with_columns(
100
+ pl.col('hit_type').map_elements(lambda hit_type: identify_bb_type(hit_type), return_dtype=str).alias('bb_type')
101
  )
 
 
 
 
 
 
 
 
 
 
102
  )
 
103
 
104
+ # translate pitch data
105
+ _pitch_df = (
106
+ _pitch_df
107
+ .filter(pl.col('pitch_name').is_not_null())
108
+ .with_columns(
109
+ pl.col('pitch_name').alias('jp_pitch_name')
 
 
 
 
 
 
 
 
 
110
  )
111
+ .with_columns(
112
+ pl.col('jp_pitch_name').map_elements(lambda pitch_name: jp_pitch_to_en_pitch[pitch_name], return_dtype=str).alias('pitch_name'),
113
+ # pl.col('jp_pitch_name').replace_strict(jp_pitch_to_en_pitch).alias('pitch_name'),
114
+ pl.col('jp_pitch_name').map_elements(lambda pitch_name: jp_pitch_to_pitch_code[pitch_name], return_dtype=str).alias('pitch_type'),
115
+ # pl.col('jp_pitch_name').map_elements(jp_pitch_to_pitch_code).alias('pitch_type'),
116
+ pl.col('description').str.split(' ').list.first().map_elements(translate_pitch_outcome, return_dtype=str),
117
+ pl.when(
118
+ pl.col('release_speed') != '-'
119
+ )
120
+ .then(
121
+ pl.col('release_speed').str.strip_suffix('km/h')
122
+ )
123
+ .otherwise(
124
+ None
125
+ )
126
+ .alias('release_speed'),
127
+ ((pl.col('plate_x') + 13) - 80).alias('plate_x'),
128
+ (200 - (pl.col('plate_z') + 13) - 100).alias('plate_z'),
129
  )
130
+ .with_columns(
131
+ pl.col('release_speed').cast(int), # idk why I can't do this during the strip_suffix step
132
  )
 
 
 
133
  )
 
 
 
 
134
 
135
+ # translate player data
136
+ register = (
137
+ pl.read_csv(os.path.join(season_dir, 'register.csv'))
138
+ .with_columns(
139
+ pl.col('en_name').str.replace(',', '').alias('en_name'),
 
140
 
141
+ )
142
+ .select(
143
+ pl.col('en_name'),
144
+ pl.col('jp_team').alias('team'),
145
+ pl.col('jp_name').alias('name')
146
+ )
147
  )
148
+ _player_df = _player_df.join(register, on=['name', 'team'], how='inner').with_columns(pl.col('en_name').alias('name')).drop(pl.col('en_name'))
149
+
150
+ # merge pitch and pa data
151
+ _df = (
152
+ (
153
+ _pitch_df
154
+ .join(_pa_df, on=['game_pk', 'pa_pk'], how='inner')
155
+ .join(_player_df.rename({'player_id': 'pitcher'}), on='pitcher', how='inner')
156
+ )
157
+ .with_columns(
158
+ pl.col('description').is_in(['SS', 'K']).alias('whiff'),
159
+ ~pl.col('description').is_in(['B', 'BB', 'LS', 'inv_K', 'bunt_K', 'HBP', 'SH', 'SH E', 'SH FC', 'obstruction', 'illegal_pitch', 'defensive_interference']).alias('swing'),
160
+ pl.col('description').is_in(['SS', 'K', 'LS', 'inv_K']).alias('csw'),
161
+ ~pl.col('description').is_in(['obstruction', 'illegal_pitch', 'defensive_interference']).alias('normal_pitch') # guess
162
+ )
163
+ ).sort(['game_pk', 'pa_pk', 'pitch_id'])
164
 
165
+ # add players to pa_df
166
+ # unfortunately we have pas that don't show up in the pitch data, so this would be useful for
167
+ _pa_df = _pa_df.join(_player_df.rename({'player_id': 'pitcher'}), on='pitcher', how='inner')
168
 
169
+ # add season dfs to main dfs
170
+ game_df.append(_game_df)
171
+ pa_df.append(_pa_df)
172
+ pitch_df.append(_pitch_df)
173
+ player_df.append(_player_df)
174
+ df.append(_df)
175
+
176
+
177
+
178
+ # combine all season dfs
179
+ game_df = pl.concat(game_df)
180
+ pa_df = pl.concat(pa_df)
181
+ pitch_df = pl.concat(pitch_df)
182
+ player_df = pl.concat(player_df).unique()
183
+ df = pl.concat(df)
184
 
185
+ assert len(_game_df) == len(_game_df['game_pk'].unique())
 
 
186
 
187
  # pitch_stats, rhb_pitch_stats, lhb_pitch_stats = [
188
  # (
game.csv → data/2024/game.csv RENAMED
File without changes
{pa → data/2024/pa}/2021019999.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020000.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020001.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020002.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020003.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020004.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020005.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020006.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020007.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020008.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020009.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020010.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020011.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020012.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020013.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020014.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020015.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020016.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020017.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020018.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020019.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020020.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020021.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020022.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020023.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020024.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020026.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020027.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020028.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020029.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020030.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020031.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020032.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020033.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020034.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020035.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020036.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020037.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020038.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020039.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020040.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020041.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020042.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020043.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020044.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020045.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020046.csv RENAMED
File without changes
{pa → data/2024/pa}/2021020047.csv RENAMED
File without changes