File size: 10,412 Bytes
bddfb4b
96e76ac
 
 
 
 
654128b
96e76ac
 
05fb91e
 
654128b
c4adefd
 
76c5b42
 
 
 
 
c4adefd
 
ddd08df
c4adefd
 
 
 
 
76c5b42
 
 
 
c4adefd
 
76c5b42
 
 
c4adefd
 
 
 
 
76c5b42
c4adefd
 
6ffc297
 
 
 
 
 
654128b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76c5b42
 
c4adefd
76c5b42
 
ecbc96c
96e76ac
 
 
 
8021559
96e76ac
628342c
96e76ac
628342c
96e76ac
 
 
 
654128b
ecbc96c
96e76ac
76c5b42
c4adefd
96e76ac
 
1e081e6
1e80e14
 
96e76ac
8021559
96e76ac
 
 
654128b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4adefd
ecbc96c
266c914
1e80e14
654128b
76c5b42
c4adefd
 
9968bbc
 
 
6ffc297
 
 
 
 
 
9968bbc
6ffc297
9968bbc
 
 
 
 
 
 
 
 
1e80e14
9968bbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76c5b42
266c914
76c5b42
1e081e6
9968bbc
 
 
 
 
 
 
 
 
c4adefd
76c5b42
c4adefd
 
 
 
1e081e6
c4adefd
76c5b42
ecbc96c
76c5b42
 
 
 
 
 
1e80e14
 
c4adefd
 
76c5b42
 
 
 
 
 
1e081e6
c4adefd
 
266c914
 
 
 
 
ecbc96c
266c914
 
 
 
 
 
1e80e14
 
266c914
 
bddfb4b
ddd08df
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import gradio as gr
import os
import json
import uuid
from datetime import datetime
import shutil
from huggingface_hub import HfApi, create_repo, upload_file, upload_folder
# Hugging Face settings. The access token comes from the environment and is
# None when the HF_TOKEN variable is not defined.
HF_TOKEN = os.getenv("HF_TOKEN")
DATASET_NAME = "minemaster01/se-culture-dataset-results"
# Set to True by setup_hf_dataset() after create_repo succeeds.
DATASET_CREATED = False

# Local folders for submission metadata and uploaded images; created at
# import time if they do not exist yet.
os.makedirs("submissions", exist_ok=True)
os.makedirs("uploaded_images", exist_ok=True)
# Administrative divisions per country. update_state_dropdown() passes the
# list for the selected country unchanged as the dropdown choices, so the
# order of each list matters.
states_by_country = {
    "India": [
        "Andhra Pradesh", "Arunachal Pradesh", "Assam", "Bihar",
        "Chhattisgarh", "Goa", "Gujarat", "Haryana",
        "Himachal Pradesh", "Jharkhand", "Karnataka", "Kerala",
        "Madhya Pradesh", "Maharashtra", "Manipur", "Meghalaya",
        "Mizoram", "Nagaland", "Odisha", "Punjab",
        "Rajasthan", "Sikkim", "Tamil Nadu", "Telangana",
        "Tripura", "Uttar Pradesh", "Uttarakhand", "West Bengal",
        "Andaman and Nicobar Islands", "Chandigarh",
        "Dadra and Nagar Haveli and Daman and Diu", "Delhi",
        "Jammu and Kashmir", "Ladakh", "Lakshadweep", "Puducherry",
    ],
    "Pakistan": [
        "Balochistan", "Khyber Pakhtunkhwa", "Punjab",
        "Sindh", "Islamabad Capital Territory", "Gilgit-Baltistan",
    ],
    "Bangladesh": [
        "Barisal", "Chittagong", "Dhaka", "Khulna",
        "Mymensingh", "Rajshahi", "Rangpur", "Sylhet",
    ],
    "Afghanistan": [
        "Badakhshan", "Badghis", "Baghlan", "Balkh",
        "Bamyan", "Daykundi", "Farah", "Faryab",
        "Ghazni", "Ghor", "Helmand", "Herat",
        "Jowzjan", "Kabul", "Kandahar", "Kapisa",
        "Khost", "Kunar", "Kunduz", "Laghman",
        "Logar", "Nangarhar", "Nimruz", "Nuristan",
        "Paktia", "Paktika", "Panjshir", "Parwan",
        "Samangan", "Sar-e Pol", "Takhar", "Uruzgan",
        "Wardak", "Zabul",
    ],
    "Bhutan": [
        "Bumthang", "Chukha", "Dagana", "Gasa",
        "Haa", "Lhuentse", "Mongar", "Paro",
        "Pemagatshel", "Punakha", "Samdrup Jongkhar", "Samtse",
        "Sarpang", "Thimphu", "Trashigang", "Trashiyangtse",
        "Trongsa", "Tsirang", "Wangdue Phodrang", "Zhemgang",
    ],
    "Nepal": [
        "Bagmati", "Gandaki", "Karnali", "Koshi",
        "Lumbini", "Madhesh", "Sudurpashchim",
    ],
    "Sri Lanka": [
        "Central", "Eastern", "North Central",
        "Northern", "North Western", "Sabaragamuwa",
        "Southern", "Uva", "Western",
    ],
}
# Choices for the "Language:" dropdown, in display order; "Other" is the
# last entry.
south_asian_languages = [
    "Hindi", "Bengali", "Urdu", "Punjabi",
    "Tamil", "Telugu", "Marathi", "Gujarati",
    "Kannada", "Malayalam", "Odia", "Sindhi",
    "Nepali", "Sinhala", "Pashto", "Dari",
    "Dzongkha", "Assamese", "Kashmiri", "Sanskrit",
    "Other",
]
def setup_hf_dataset():
    """Ensure the Hugging Face dataset repository exists.

    Behaviour by case:

    * ``HF_TOKEN`` is not set: print a warning and return.
    * ``DATASET_CREATED`` is already True: return without doing anything.
    * Otherwise: call ``create_repo`` for ``DATASET_NAME`` with
      ``exist_ok=True`` and set the module-level ``DATASET_CREATED`` flag
      to True on success.

    Any exception raised by ``create_repo`` is caught and printed instead of
    being propagated; ``DATASET_CREATED`` then stays False.
    """
    global DATASET_CREATED

    if not HF_TOKEN:
        print("Warning: HF_TOKEN not set. Data will be stored locally only.")
        return
    if DATASET_CREATED:
        return

    try:
        # exist_ok=True makes this call safe when the repo already exists.
        create_repo(
            DATASET_NAME,
            repo_type="dataset",
            token=HF_TOKEN,
            exist_ok=True
        )
    except Exception as e:
        # Best effort: report the failure and leave the flag unset.
        print(f"Error setting up dataset: {e}")
    else:
        DATASET_CREATED = True
        print(f"Dataset {DATASET_NAME} is ready")
def update_state_dropdown(country):
    """Rebuild the state/province dropdown for the selected country.

    Args:
        country: Current value of the country dropdown. Values that are not
            keys of ``states_by_country`` (such as the "None" choice or an
            empty selection) produce an empty choice list and the generic
            label.

    Returns:
        A ``gr.Dropdown`` with the choices and label for ``country``. Its
        value is explicitly reset to ``None`` so that a state picked for a
        previously selected country is not carried over to the new one.
    """
    known_country = country in states_by_country
    choices = states_by_country[country] if known_country else []
    label = f"State/Province in {country}:" if known_country else "State/Province:"
    # value=None clears any stale selection that is not valid for the new choices.
    return gr.Dropdown(choices=choices, value=None, label=label, interactive=True)
def process_submission(input_img, language, country, state, city, se_asia_relevance, culture_knowledge, native_caption, english_caption, email):
    """Store one submission locally, mirror it to the dataset, and echo it back.

    The image (if any) is written to ``uploaded_images/`` and the metadata to
    ``submissions/`` as JSON. When ``HF_TOKEN`` is set and ``DATASET_CREATED``
    is True, both files are also uploaded to ``DATASET_NAME``; upload errors
    are printed and do not abort the submission.

    Args:
        input_img: ``None``, a path string to an image file, or an object
            with a ``save`` method (e.g. a PIL image).
        language: Selected language; stored under the ``cultural_relevance``
            key of the JSON record.
        country: Selected country.
        state: Selected state/province; may be empty.
        city: City entered by the user.
        se_asia_relevance: Selected cultural-relevance option.
        culture_knowledge: Selected knowledge-source option.
        native_caption: Caption in the native language.
        english_caption: Caption in English.
        email: Contributor email address.

    Returns:
        A 7-tuple: the original ``input_img`` followed by six display strings
        (text response, location, relevance, knowledge source, native
        caption, English caption).
    """
    submission_id = str(uuid.uuid4())
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # The timestamp has one-second resolution, so the unique id is added to
    # the file stem; otherwise two submissions arriving in the same second
    # would overwrite each other's files.
    file_stem = f"{timestamp}_{submission_id}"

    image_path = None
    if input_img is not None:
        image_path = os.path.join("uploaded_images", f"{file_stem}.jpg")
        if isinstance(input_img, str):
            shutil.copy(input_img, image_path)
        else:
            # Pillow's JPEG writer cannot encode alpha/palette modes such as
            # RGBA or P, so convert a copy to RGB before saving. The original
            # object is left untouched because it is returned to the caller.
            image_to_save = input_img
            mode = getattr(image_to_save, "mode", None)
            if mode is not None and mode not in ("RGB", "L", "CMYK"):
                image_to_save = image_to_save.convert("RGB")
            image_to_save.save(image_path)

    # NOTE(review): "cultural_relevance" holds the selected language and
    # "se_asia_relevance" keeps its historical name; the keys are left as-is
    # so new records match the ones already written.
    submission_data = {
        "id": submission_id,
        "timestamp": timestamp,
        "image_filename": os.path.basename(image_path) if image_path else None,
        "cultural_relevance": language,
        "country": country,
        "state": state,
        "city": city,
        "se_asia_relevance": se_asia_relevance,
        "cultural_knowledge_source": culture_knowledge,
        "native_caption": native_caption,
        "english_caption": english_caption,
        "email": email
    }

    json_filename = f"{file_stem}.json"
    json_path = os.path.join("submissions", json_filename)
    # Write UTF-8 explicitly and keep non-ASCII text unescaped so captions in
    # native scripts stay readable in the stored file.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(submission_data, f, indent=2, ensure_ascii=False)

    if HF_TOKEN and DATASET_CREATED:
        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=json_path,
                path_in_repo=f"submissions/{json_filename}",
                repo_id=DATASET_NAME,
                repo_type="dataset",
                token=HF_TOKEN
            )
            if image_path and os.path.exists(image_path):
                api.upload_file(
                    path_or_fileobj=image_path,
                    path_in_repo=f"images/{os.path.basename(image_path)}",
                    repo_id=DATASET_NAME,
                    repo_type="dataset",
                    token=HF_TOKEN
                )
            print(f"Submission {submission_id} uploaded to Hugging Face Dataset")
        except Exception as e:
            # Best effort: the local copies already exist, so only report.
            print(f"Error uploading to dataset: {e}")

    location_info = f"Location: {city}, {state}, {country}" if state else f"Location: {city}, {country}"
    return (
        input_img,
        f"Your text response: {language}",
        f"Selected location: {location_info}",
        f"SE Asia relevance: {se_asia_relevance}",
        f"Cultural knowledge source: {culture_knowledge}",
        f"Native caption: {native_caption}",
        f"English caption: {english_caption}",
    )
def clear_inputs():
    """Return reset values for the ten form fields cleared by the Clear button.

    The tuple follows the order of the Clear button's outputs. Image,
    dropdown and radio fields are reset with ``None``; textboxes are reset
    with an empty string. The language dropdown is reset with ``None`` rather
    than ``""`` because an empty string is not one of its choices.

    Returns:
        A 10-tuple of reset values.
    """
    return (
        None,  # input_img
        None,  # language (dropdown)
        None,  # country_dropdown
        None,  # state_dropdown
        "",    # city_textbox
        None,  # se_asia_relevance (radio)
        None,  # culture_knowledge (radio)
        "",    # native_caption
        "",    # english_caption
        "",    # email_input
    )
# Try to prepare the Hugging Face dataset repo at import time, before the UI
# is built.
setup_hf_dataset()

with gr.Blocks() as gradio_app:
    # Choice lists used by the form widgets below.
    country_choices = ["None", "India", "Pakistan", "Bangladesh", "Afghanistan", "Bhutan", "Nepal", "Sri Lanka"]
    relevance_choices = [
        "Yes. Unique to South Asia",
        "Yes, people will likely think of South Asia when seeing the picture, but it may have low degree of similarity to other cultures.",
        "Maybe, this culture did not originate from South Asia, but it's quite dominant in South Asia",
        "Not really. It has some affiliation to South Asia, but actually does not represent South Asia or has stronger affiliation to cultures outside South Asia",
        "No. Totally unrelated to South Asia",
    ]
    knowledge_choices = [
        "I'm from this country/culture",
        "I checked online resources (e.g., Wikipedia, articles, blogs)",
    ]

    gr.Markdown("# South Asian Image Data Collection")
    gr.Markdown("Upload an image and answer questions about its cultural significance.")

    # --- Input form: image and location on the left, questions on the right.
    with gr.Row():
        with gr.Column(scale=1):
            input_img = gr.Image(label="Upload an image", sources=['upload', 'webcam'], type="pil")
            language = gr.Dropdown(
                choices=south_asian_languages,
                label="Language:",
                info="Select the native language relevant to the image",
                interactive=True,
            )
            country_dropdown = gr.Dropdown(
                choices=country_choices,
                label="Country where the image was taken:",
                interactive=True,
            )
            # Starts empty; filled by update_state_dropdown when the country changes.
            state_dropdown = gr.Dropdown(choices=[], label="State/Province:", interactive=True)
            city_textbox = gr.Textbox(label="City where the image was taken:", placeholder="Enter city name")
            email_input = gr.Textbox(
                label="Your Email:",
                placeholder="Enter your email address",
                info="Used as unique contributor ID",
            )
        with gr.Column(scale=1):
            se_asia_relevance = gr.Radio(
                choices=relevance_choices,
                label="Is the image culturally relevant in South Asia?",
            )
            culture_knowledge = gr.Radio(
                choices=knowledge_choices,
                label="How do you know about this culture?",
                info="Please do not consult LLMs (e.g., GPT-4o, Claude, Command-R, etc.)",
            )
            native_caption = gr.Textbox(
                label="Caption in Native Language:",
                placeholder="Enter caption in the native language of the culture depicted",
            )
            english_caption = gr.Textbox(label="English Caption:", placeholder="Enter caption in English")

    # --- Action buttons.
    with gr.Row():
        clear_btn = gr.Button("Clear")
        submit_btn = gr.Button("Submit")

    # --- Read-only echo of the submitted values.
    with gr.Row():
        with gr.Column(scale=1):
            output_img = gr.Image(label="Submitted Image")
            output_text = gr.Text(label="Text Response")
            output_location = gr.Text(label="Location Information")
        with gr.Column(scale=1):
            output_relevance = gr.Text(label="South Asia Cultural Relevance")
            output_knowledge = gr.Text(label="Cultural Knowledge Source")
            output_native = gr.Text(label="Native Language Caption")
            output_english = gr.Text(label="English Caption")

    # The ten form fields, in the positional order expected by
    # process_submission and returned by clear_inputs.
    form_fields = [
        input_img,
        language,
        country_dropdown,
        state_dropdown,
        city_textbox,
        se_asia_relevance,
        culture_knowledge,
        native_caption,
        english_caption,
        email_input,
    ]
    # The seven echo widgets, in the order of process_submission's return tuple.
    result_fields = [
        output_img,
        output_text,
        output_location,
        output_relevance,
        output_knowledge,
        output_native,
        output_english,
    ]

    # --- Event wiring.
    country_dropdown.change(fn=update_state_dropdown, inputs=country_dropdown, outputs=state_dropdown)
    submit_btn.click(fn=process_submission, inputs=form_fields, outputs=result_fields)
    clear_btn.click(fn=clear_inputs, inputs=[], outputs=form_fields)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    gradio_app.launch()