File size: 2,424 Bytes
edc06cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import hashlib
import io
from typing import Any

import numpy as np
import soundfile as sf
from fastapi.encoders import jsonable_encoder


def round_floats(value: Any, round_value: int) -> Any:
    """floatの小数点以下を再帰的に丸める"""
    if isinstance(value, float):
        return round(value, round_value)
    elif isinstance(value, np.ndarray) and np.issubdtype(value.dtype, np.floating):
        return np.round(value, round_value)
    elif isinstance(value, list):
        return [round_floats(v, round_value) for v in value]
    elif isinstance(value, dict):
        return {k: round_floats(v, round_value) for k, v in value.items()}
    else:
        return value


def pydantic_to_native_type(value: Any) -> Any:
    """pydanticの型をnativeな型に変換する"""
    return jsonable_encoder(value)


def hash_long_string(value: Any) -> Any:
    """文字数が1000文字を超えるものはハッシュ化する"""

    def to_hash(value: str) -> str:
        return "MD5:" + hashlib.md5(value.encode()).hexdigest()

    if isinstance(value, str):
        return value if len(value) <= 1000 else to_hash(value)
    elif isinstance(value, list):
        return [hash_long_string(v) for v in value]
    elif isinstance(value, dict):
        return {k: hash_long_string(v) for k, v in value.items()}
    else:
        return value


def summarize_big_ndarray(value: Any) -> Any:
    """要素数が100を超える NDArray を、ハッシュ値と shape からなる文字列へ要約する"""

    def to_hash(value: np.ndarray) -> str:
        return "MD5:" + hashlib.md5(value.tobytes()).hexdigest()

    if isinstance(value, np.ndarray):
        if value.size <= 100:
            return value
        else:
            return {"hash": to_hash(value), "shape": value.shape}
    elif isinstance(value, list):
        return [summarize_big_ndarray(v) for v in value]
    elif isinstance(value, dict):
        return {k: summarize_big_ndarray(v) for k, v in value.items()}
    else:
        return value


def hash_wave_floats_from_wav_bytes(wav_bytes: bytes) -> str:
    """.wavファイルバイト列から音声波形を抽出しハッシュ化する"""
    wave = sf.read(io.BytesIO(wav_bytes))[0].tolist()
    # NOTE: Linux-Windows 数値精度問題に対するワークアラウンド
    wave = round_floats(wave, 2)
    return "MD5:" + hashlib.md5(np.array(wave).tobytes()).hexdigest()