import gc
import os
import sys
import time
from pathlib import Path
from typing import List

import pandas as pd
from tqdm import tqdm

from tools import MechEventName

# fraction by which the events filter's batch size is reduced after an RPC timeout
REDUCE_FACTOR = 0.25
# seconds to pause after resizing the window, before retrying
SLEEP = 0.5

REQUEST_ID_FIELD = "request_id"
BLOCK_FIELD = "block"

SCRIPTS_DIR = Path(__file__).parent
ROOT_DIR = SCRIPTS_DIR.parent
DATA_DIR = ROOT_DIR / "data"


def parse_args() -> str:
    """Parse the arguments and return the RPC."""
    if len(sys.argv) != 2:
        raise ValueError("Expected the RPC as a positional argument.")
    return sys.argv[1]
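
# Illustrative invocation (the script name and RPC URL below are placeholders,
# not taken from the original file):
#   python <script>.py https://rpc.example.org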


def read_abi(abi_path: str) -> str:
    """Read and return a contract's ABI from the given path."""
    with open(abi_path) as abi_file:
        return abi_file.read()


def reduce_window(contract_instance, event, from_block, batch_size, latest_block):
    """Dynamically reduce the batch size window."""
    keep_fraction = 1 - REDUCE_FACTOR
    # rebuild the events filter over a smaller block window
    events_filter = contract_instance.events[event].build_filter()
    events_filter.fromBlock = from_block
    batch_size = int(batch_size * keep_fraction)
    events_filter.toBlock = min(from_block + batch_size, latest_block)
    tqdm.write(f"RPC timed out! Resizing batch size to {batch_size}.")
    # give the RPC a moment before retrying
    time.sleep(SLEEP)
    return events_filter, batch_size
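
# Minimal usage sketch (an assumption, not code from this file): how
# `reduce_window` might back off inside a fetch loop. `w3` is a hypothetical
# Web3 instance; `build_filter()` / `deploy()` / `get_all_entries()` are
# web3.py's event filter-builder calls.
#
#   while from_block <= latest_block:
#       events_filter = contract_instance.events[event].build_filter()
#       events_filter.fromBlock = from_block
#       events_filter.toBlock = min(from_block + batch_size, latest_block)
#       try:
#           entries = events_filter.deploy(w3).get_all_entries()
#       except TimeoutError:  # or the RPC client's timeout exception
#           # shrink the window and retry the same range
#           events_filter, batch_size = reduce_window(
#               contract_instance, event, from_block, batch_size, latest_block
#           )
#           continue
#       from_block = min(from_block + batch_size, latest_block) + 1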


def limit_text(text: str, limit: int = 200) -> str:
    """Truncate the given text to `limit` characters, appending an ellipsis if it was longer."""
    if len(text) > limit:
        return f"{text[:limit]}..."
    return text
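
# For example, limit_text("x" * 205) returns 200 "x"s followed by "...".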


def check_for_dicts(df: pd.DataFrame) -> List[str]:
    """Check for columns that contain dictionaries."""
    dict_columns = []
    for column in df.columns:
        if df[column].apply(lambda x: isinstance(x, dict)).any():
            dict_columns.append(column)
    return dict_columns


def drop_dict_rows(df: pd.DataFrame, dict_columns: List[str]) -> pd.DataFrame:
    """Drop rows that contain dictionaries."""
    for column in dict_columns:
        df = df[~df[column].apply(lambda x: isinstance(x, dict))]
    return df


def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the dataframe."""
    dict_columns = check_for_dicts(df)
    df = drop_dict_rows(df, dict_columns)
    cleaned = df.drop_duplicates()
    # normalize the request ids to strings
    cleaned[REQUEST_ID_FIELD] = cleaned[REQUEST_ID_FIELD].astype("str")
    return cleaned
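
# Illustrative example (not from the original script; the column names are
# made up): rows whose `result` holds a dict are dropped, duplicates removed,
# and the request ids end up as strings.
#
#   df = pd.DataFrame({
#       "request_id": [1, 1, 2],
#       "result": ["ok", "ok", {"raw": "payload"}],
#   })
#   clean(df)  # -> a single row, with request_id "1" as a string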


def gen_event_filename(event_name: MechEventName) -> str:
    """Generate the filename of an event."""
    return f"{event_name.value.lower()}s.parquet"


def read_n_last_lines(filename: str, n: int = 1) -> str:
    """Return the content of the `n`-th last line of a file."""
    num_newlines = 0
    with open(filename, "rb") as f:
        try:
            # scan backwards from the end of the file, counting newline bytes
            f.seek(-2, os.SEEK_END)
            while num_newlines < n:
                if f.read(1) == b"\n":
                    num_newlines += 1
                    if num_newlines == n:
                        # stop right after the newline preceding the target line
                        break
                f.seek(-2, os.SEEK_CUR)
        except OSError:
            # reached the start of the file before finding `n` newlines
            f.seek(0)
        last_line = f.readline().decode()
    return last_line
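
# Illustrative behavior (assuming "\n" line endings, since the scan is
# byte-oriented): for a file containing b"a\nb\nc\n",
# read_n_last_lines(path) returns "c\n" and
# read_n_last_lines(path, n=2) returns "b\n".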


def get_earliest_block(event_name: MechEventName) -> int:
    """Get the earliest block number to use when filtering for events."""
    filename = gen_event_filename(event_name)
    if not os.path.exists(DATA_DIR / filename):
        return 0
    df = pd.read_parquet(DATA_DIR / filename)
    block_field = f"{event_name.value.lower()}_{BLOCK_FIELD}"
    # the latest block already stored is the earliest block for the next fetch
    earliest_block = int(df[block_field].max())
    # clean and release all memory
    del df
    gc.collect()
    return earliest_block
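
# Sketch of the intended incremental-fetch pattern (an assumption based on the
# helpers above, not code from this file): resume from the last block stored in
# the event's parquet file, or from a fallback block otherwise.
# `EARLIEST_DEPLOY_BLOCK` and `fetch_events` are hypothetical names.
#
#   from_block = get_earliest_block(event_name) or EARLIEST_DEPLOY_BLOCK
#   fetch_events(event_name, from_block=from_block)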