File size: 4,505 Bytes
e0db39e
 
 
 
0946447
e0db39e
 
 
 
 
 
 
 
 
 
d1a2df2
 
 
 
 
 
 
 
 
 
 
 
 
 
e0db39e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d2d9db
 
 
 
 
 
 
 
 
 
 
e0db39e
 
 
 
 
d1a2df2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0946447
 
e0db39e
 
 
 
 
0946447
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import re
import json

import pandas as pd
import plotly.express as px
import multiprocessing.pool
from spacy.lang.en import English

def _load_lexicon(path):
    """Read a JSON lexicon file and return the parsed object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, and decoded as UTF-8 explicitly so the result does not
    depend on the platform's default encoding.
    """
    with open(path, "r", encoding="utf-8") as lexicon_file:
        return json.load(lexicon_file)


# Lexicons are loaded once at import time; the functions below read the
# "male_pronouns", "female_pronouns" and "professions" entries from them.
gender_lexicons = _load_lexicon("config/gender_lexicons.json")
profession_lexicons = _load_lexicon("config/profession_lexicons.json")

# Shared spaCy pipeline: the "sentencizer" component is what makes
# ``doc.sents`` available to get_split_text.
nlp = English()
nlp.add_pipe("sentencizer")


def call_multiprocessing_pool(df_text):
    """Run the gender/profession matcher over every text using a thread pool.

    Args:
        df_text: Iterable of text strings (e.g. a pandas Series); each item is
            passed to ``get_gender_prof_match_details``.

    Returns:
        A DataFrame with one row per sentence and the columns
        "Split Text", "Male Pronoun", "Female Pronoun", "Profession" and
        "Both Match". Rows keep the order of the input texts because
        ``pool.map`` preserves input order.
    """
    concurrent = 2000
    texts = list(df_text)

    # The pool starts all of its worker threads up front, so never ask for
    # more threads than there are texts (and always at least one, which the
    # pool requires even for empty input).
    workers = max(1, min(concurrent, len(texts)))

    # The context manager tears the pool down even when a worker raises;
    # ``map`` has already collected every result by the time the block exits.
    with multiprocessing.pool.ThreadPool(processes=workers) as pool:
        result_list = pool.map(get_gender_prof_match_details, texts, chunksize=1)

    # Each text yields a list of per-sentence tuples; flatten to one row list.
    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)


def get_split_text(text):
    """Return the sentences of *text* as a list.

    The text is run through the module-level ``nlp`` pipeline and the items
    of the resulting ``doc.sents`` are collected in order.
    """
    return list(nlp(text).sents)


def compile_regex_patterns(patterns):
    """Compile one case-insensitive, whole-word regex per list of terms.

    Args:
        patterns: Iterable of term lists. The terms of each list are joined
            with ``|`` and wrapped in ``\\b(...)\\b``. Terms are inserted
            as-is (not escaped), so they are interpreted as regex fragments.

    Returns:
        A list of compiled patterns, one per input list, in the same order.
        An empty term list yields a pattern that never matches.
    """
    compiled = []
    for pattern in patterns:
        terms = list(pattern)
        if terms:
            regex = r"\b({})\b".format("|".join(terms))
        else:
            # An empty alternation would compile to r"\b()\b", which matches
            # the empty string at every word boundary and makes findall()
            # report hits for any text. Use a pattern that cannot match.
            regex = r"(?!)"
        compiled.append(re.compile(regex, flags=re.IGNORECASE))
    return compiled


def get_gender_prof_match_details(df_text):
    """Find pronoun and profession matches in each sentence of a text.

    The text is split into sentences with ``get_split_text`` and every
    sentence is searched with three patterns built from the "male_pronouns",
    "female_pronouns" and "professions" lexicon entries.

    Returns:
        A list with one tuple per sentence:
        ``(sentence, male matches, female matches, profession matches, flag)``.
        The match fields are comma-joined strings (empty when nothing
        matched). The flag is "Yes" when the sentence has a profession match
        together with a male or female pronoun match, otherwise "No".
    """
    male_pat, female_pat, prof_pat = compile_regex_patterns(
        [
            gender_lexicons.get("male_pronouns"),
            gender_lexicons.get("female_pronouns"),
            profession_lexicons.get("professions"),
        ]
    )

    rows = []
    for sentence in get_split_text(df_text):
        sentence_text = str(sentence)

        male_hits = male_pat.findall(sentence_text)
        female_hits = female_pat.findall(sentence_text)
        prof_hits = prof_pat.findall(sentence_text)

        # A profession must co-occur with a pronoun of either gender.
        pronoun_with_profession = bool(prof_hits) and bool(male_hits or female_hits)

        rows.append(
            (
                sentence_text,
                ",".join(male_hits),
                ",".join(female_hits),
                ",".join(prof_hits),
                "Yes" if pronoun_with_profession else "No",
            )
        )

    return rows


def get_statistics(result):
    """Summarise the per-sentence match table as a dict of string counts.

    Args:
        result: DataFrame with the columns "Both Match", "Male Pronoun",
            "Female Pronoun" and "Profession". An empty string in a match
            column means no match for that row.

    Returns:
        Dict mapping each metric name to its count rendered as a string.
    """
    # Boolean masks, one per condition, reused by the combined counts below.
    both_flagged = result["Both Match"] == "Yes"
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""

    counts = (
        ("both_gender_prof_match", both_flagged.sum()),
        ("count_male_pronoun", has_male.sum()),
        ("count_female_pronoun", has_female.sum()),
        ("count_male_pronoun_profession", (has_male & has_profession).sum()),
        ("count_female_pronoun_profession", (has_female & has_profession).sum()),
        ("total_sentence", len(result)),
    )

    return {metric: str(value) for metric, value in counts}


def get_plot(result_json):
    """Build a pie chart from the statistics dict.

    Args:
        result_json: Mapping that contains the five count metrics used below;
            each value is converted with ``int``.

    Returns:
        The figure produced by ``px.pie`` with one slice per metric.
    """
    # (slice label, key in result_json), in display order.
    slices = (
        ("Both Gender & Profession Match", "both_gender_prof_match"),
        ("Male Pronoun", "count_male_pronoun"),
        ("Female Pronoun", "count_female_pronoun"),
        ("Male Pronoun & Profession", "count_male_pronoun_profession"),
        ("Female Pronoun & Profession", "count_female_pronoun_profession"),
    )

    values = [int(result_json[key]) for _, key in slices]
    chart_data = {
        "Labels": [label for label, _ in slices],
        "Values": values,
    }

    return px.pie(
        chart_data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )


def eval_gender_profession(data):
    """Evaluate gender/profession co-occurrence in the first column of *data*.

    Args:
        data: DataFrame whose first column holds the texts to analyse.

    Returns:
        A tuple ``(result_df, result_plot, result_conclusion)``:
        ``result_df`` is a two-column ("Metric", "Value") table of the
        statistics, ``result_plot`` is the figure from ``get_plot`` and
        ``result_conclusion`` is currently always an empty string.
    """
    # Only the first column is analysed. Missing cells are dropped and the
    # remaining values coerced to str so that NaN or non-string entries do
    # not break the ``.str`` accessor or the sentence splitter downstream.
    texts = data[data.columns[0]].dropna().astype(str).str.lower().str.strip()

    result = call_multiprocessing_pool(texts)

    result_json = get_statistics(result)
    result_plot = get_plot(result_json)

    # Turn the {metric: value} dict into a two-column table.
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )

    result_conclusion = ""

    return result_df, result_plot, result_conclusion