mathtext

Runtime error

App Files Files Community

cetinca committed on Jan 4, 2023

Commit

43ac953

verified ·

1 Parent(s): 72f4be8

New version text2int

Browse files

Files changed (5) hide show

.gitignore +3 -0
app.py +155 -143
plot_calls.py +23 -3
test_api.py +3 -3
test_api.sh +41 -24

.gitignore CHANGED Viewed

@@ -100,3 +100,6 @@ docs/**/*.html
 **/*private*
 /call_history.csv
 /call_history.txt

 **/*private*
 /call_history.csv
 /call_history.txt
+/output.csv
+/call_history_bash.csv
+/call_history_sentiment_bash.csv

app.py CHANGED Viewed

@@ -1,20 +1,15 @@
-import inspect
-import json
-import logging
-import os
-from typing import List, Type
 import gradio as gr
 import spacy  # noqa
-from dotenv import load_dotenv
-from gradio import routes
 from transformers import pipeline
-load_dotenv()
-TOKENS2INT_ERROR_INT = 32202
-log = logging.getLogger()
 ONES = [
     "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
@@ -22,20 +17,55 @@ ONES = [
     "sixteen", "seventeen", "eighteen", "nineteen",
 ]
-# token_mapping = json.load(open('str_mapping.json'))
 CHAR_MAPPING = {
     "-": " ",
     "_": " ",
 }
-CHAR_MAPPING.update((str(i), word) for i, word in enumerate([" " + s + " " for s in ONES]))
-TOKEN_MAPPING = dict(enumerate([" " + s + " " for s in ONES]))
-BQ_JSON = os.environ['BQ_JSON']
 def tokenize(text):
-    return text.split()
 def detokenize(tokens):
@@ -47,96 +77,122 @@ def replace_tokens(tokens, token_mapping=TOKEN_MAPPING):
 def replace_chars(text, char_mapping=CHAR_MAPPING):
-    return ''.join((char_mapping.get(c, c) for c in text))
-def tokens2int(tokens, numwords={}):
-    """ Convert an English str containing number words into an int
-    >>> text2int("nine")
-    9
-    >>> text2int("forty two")
-    42
-    >>> text2int("1 2 three")
-    123
-    """
-    if not numwords:
-        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
-        scales = ["hundred", "thousand", "million", "billion", "trillion"]
-        numwords["and"] = (1, 0)
         for idx, word in enumerate(ONES):
-            numwords[word] = (1, idx)
         for idx, word in enumerate(tens):
-            numwords[word] = (1, idx * 10)
         for idx, word in enumerate(scales):
-            numwords[word] = (10 ** (idx * 3 or 2), 0)
-    current = result = 0
-    for word in tokens:
-        if word not in numwords:
-            raise Exception("Illegal word: " + word)
-        scale, increment = numwords[word]
-        current = current * scale + increment
-        if scale > 100:
-            result += current
-            current = 0
-    return str(result + current)
-def text2int(text):
-    return tokens2int(tokenize(replace_chars(text)))
-def try_text2int(text):
-    text = str(text)
     try:
-        intstr = tokens2int(tokens2int(tokenize(replace_chars(text))))
-    except Exception as e:
-        log.error(str(e))
-        log.error(f'User input: {text}')
-        intstr = TOKENS2INT_ERROR_INT
-    return str(intstr)
-def try_text2int_preprocessed(text):
-    text = str(text)
-    try:
-        tokens = replace_tokens(tokenize(replace_chars(str(text))))
-    except Exception as e:
-        log.error(str(e))
-        tokens = text.split()
-    try:
-        intstr = tokens2int(tokens)
-    except Exception as e:
-        log.error(str(e))
-        intstr = str(TOKENS2INT_ERROR_INT)
-    return intstr
-def get_types(cls_set: List[Type], component: str):
-    docset = []
     types = []
-    if component == "input":
-        for cls in cls_set:
-            doc = inspect.getdoc(cls)
-            doc_lines = doc.split("\n")
-            docset.append(doc_lines[1].split(":")[-1])
-            types.append(doc_lines[1].split(")")[0].split("(")[-1])
     else:
-        for cls in cls_set:
-            doc = inspect.getdoc(cls)
-            doc_lines = doc.split("\n")
-            docset.append(doc_lines[-1].split(":")[-1])
-            types.append(doc_lines[-1].split(")")[0].split("(")[-1])
-    return docset, types
-routes.get_types = get_types
 sentiment = pipeline(task="sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
@@ -159,7 +215,7 @@ with gr.Blocks() as html_block:
         button_text2int = gr.Button("text2int")
         button_text2int.click(
-            fn=try_text2int,
             inputs=inputs_text2int,
             outputs=outputs_text2int,
             api_name="text2int",
@@ -173,7 +229,6 @@ with gr.Blocks() as html_block:
         gr.Examples(examples=examples_text2int, inputs=inputs_text2int)
         gr.Markdown(r"""
         ## API
         ```python
         import requests
@@ -188,49 +243,7 @@ with gr.Blocks() as html_block:
         ```bash
         curl -X POST https://tangibleai-mathtext.hf.space/run/text2int -H 'Content-Type: application/json' -d '{"data": ["one hundred forty five"]}'
         ```
-        {bq_json}""" + f"{json.loads(BQ_JSON)['type']}")
-    with gr.Tab("Text to integer preprocessed"):
-        inputs_text2int_preprocessed = [
-            gr.Text(placeholder="Type a number as text or a sentence", label="Text to process",
-                    value="forty two"),
-        ]
-        outputs_text2int_preprocessed = gr.Textbox(label="Output integer")
-        button_text2int = gr.Button("text2int preprocessed")
-        button_text2int.click(
-            fn=try_text2int_preprocessed,
-            inputs=inputs_text2int_preprocessed,
-            outputs=outputs_text2int_preprocessed,
-            api_name="text2int-preprocessed",
-        )
-        examples_text2int_preprocessed = [
-            "one thousand forty seven",
-            "one hundred",
-        ]
-        gr.Examples(examples=examples_text2int_preprocessed, inputs=inputs_text2int_preprocessed)
-        gr.Markdown(r"""
-        ## API
-        ```python
-        import requests
-        requests.post(
-            url="https://tangibleai-mathtext.hf.space/run/text2int-preprocessed", json={"data": ["one hundred forty five"]}
-        ).json()
-        ```
-        Or using `curl`:
-        ```bash
-        curl -X POST https://tangibleai-mathtext.hf.space/run/text2int-preprocessed -H 'Content-Type: application/json' -d '{"data": ["one hundred forty five"]}'
-        ```
-        {bq_json}""" + f"{json.loads(BQ_JSON)['type']}")
     with gr.Tab("Sentiment Analysis"):
         inputs_sentiment = [
@@ -257,7 +270,6 @@ with gr.Blocks() as html_block:
         gr.Examples(examples=examples_sentiment, inputs=inputs_sentiment)
         gr.Markdown(r"""
         ## API
         ```python
         import requests
@@ -272,7 +284,7 @@ with gr.Blocks() as html_block:
         ```bash
         curl -X POST https://tangibleai-mathtext.hf.space/run/sentiment-analysis -H 'Content-Type: application/json' -d '{"data": ["You are right!"]}'
         ```
-        {bq_json}""" + f"{json.loads(BQ_JSON)['type']}")
 # interface = gr.Interface(lambda x: x, inputs=["text"], outputs=["text"])
 # html_block.input_components = interface.input_components

 import gradio as gr
 import spacy  # noqa
 from transformers import pipeline
+# import os
+# os.environ['KMP_DUPLICATE_LIB_OK']='True'
+# import spacy
+# Change this according to what words should be corrected to
+SPELL_CORRECT_MIN_CHAR_DIFF = 2
+TOKENS2INT_ERROR_INT = 32202
 ONES = [
     "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
     "sixteen", "seventeen", "eighteen", "nineteen",
 ]
 CHAR_MAPPING = {
     "-": " ",
     "_": " ",
+    "and": " ",
+}
+# CHAR_MAPPING.update((str(i), word) for i, word in enumerate([" " + s + " " for s in ONES]))
+TOKEN_MAPPING = {
+    "and": " ",
+    "oh": "0",
 }
def find_char_diff(a, b):
    """Return how many characters differ between two strings, ignoring order.

    Both strings are compared as multisets of characters: for every distinct
    character the absolute difference between its number of occurrences in
    ``a`` and in ``b`` is added up.  This is not an edit distance - the
    position of a character does not matter.

    The measure is symmetric, so characters that appear only in ``b`` count
    just like characters that appear only in ``a``.

    Args:
        a: first string.
        b: second string.

    Returns:
        The total count mismatch as a non-negative int (0 when both strings
        contain exactly the same characters).
    """
    # Positive balance: surplus in ``a``; negative balance: surplus in ``b``.
    balance = {}
    for char in a:
        balance[char] = balance.get(char, 0) + 1
    for char in b:
        balance[char] = balance.get(char, 0) - 1
    return sum(abs(surplus) for surplus in balance.values())
 def tokenize(text):
+    text = text.lower()
+    # print(text)
+    text = replace_tokens(''.join(i for i in replace_chars(text)).split())
+    # print(text)
+    text = [i for i in text if i != ' ']
+    # print(text)
+    output = []
+    for word in text:
+        # print(word)
+        output.append(convert_word_to_int(word))
+    output = [i for i in output if i != ' ']
+    # print(output)
+    return output
 def detokenize(tokens):
 def replace_chars(text, char_mapping=CHAR_MAPPING):
+    return [char_mapping.get(c, c) for c in text]
def convert_word_to_int(in_word, numwords={}):
    """Convert a single word into the integer it stands for.

    Resolution order:

    1. exact match against the known number words (``ONES``, the tens and
       the scale words);
    2. a literal integer such as ``"42"``;
    3. spell correction: the known number word with the smallest
       ``find_char_diff`` to ``in_word`` is used, provided that difference
       does not exceed ``SPELL_CORRECT_MIN_CHAR_DIFF``.

    Args:
        in_word: the word to convert.
        numwords: lookup table from number word to value.  The mutable
            default is deliberate: it is filled on the first call and then
            serves as a cache for all later calls.

    Returns:
        The integer value, or ``None`` when the word is empty or cannot be
        matched to any number.
    """
    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
    scales = ["hundred", "thousand", "million", "billion", "trillion"]
    if not numwords:
        for idx, word in enumerate(ONES):
            numwords[word] = idx
        for idx, word in enumerate(tens):
            # The two leading "" entries only pad the list so that the index
            # matches the tens digit; they are not number words.
            if word:
                numwords[word] = idx * 10
        for idx, word in enumerate(scales):
            # "hundred" is 10**2; every following scale is 10**(3 * idx).
            numwords[word] = 10 ** (idx * 3 or 2)

    if in_word in numwords:
        return numwords[in_word]

    try:
        return int(in_word)
    except ValueError:
        pass

    if not in_word:
        return None

    # Spell correction using find_char_diff.  Only real, convertible words
    # are candidates, and the value of the matched word is returned - not
    # its position in the candidate list.
    candidates = [word for word in ONES + tens + scales if word and word in numwords]
    if not candidates:
        return None
    char_diffs = [find_char_diff(in_word, word) for word in candidates]
    min_char_diff = min(char_diffs)
    if min_char_diff <= SPELL_CORRECT_MIN_CHAR_DIFF:
        return numwords[candidates[char_diffs.index(min_char_diff)]]
    return None
def tokens2int(tokens):
    """Combine a list of integer tokens into the single integer they spell.

    Every token is first put into a magnitude class: 1 for values up to 9,
    2 for values up to 90 and 3 for anything larger (the scale words).
    Those classes drive a set of heuristics:

    * up to three tokens: walk left to right and add, multiply or - for
      digit-like sequences - concatenate;
    * more than three tokens, all of class 1: concatenate the digits;
    * otherwise: consume the tokens in pairs and add each pair's value to
      the total; a trailing unpaired token is added as is.

    Args:
        tokens: list of ints, e.g. ``[1, 100, 40, 5]``.

    Returns:
        The combined int; ``0`` for an empty list.

    Raises:
        TypeError: if a token cannot be compared with an int (e.g. ``None``).
    """
    types = [1 if token <= 9 else 2 if token <= 90 else 3 for token in tokens]
    has_units = 1 in types
    has_tens = 2 in types
    has_scales = 3 in types

    if len(tokens) <= 3:
        current = 0
        for i, number in enumerate(tokens):
            if i != 0 and types[i] < types[i - 1] and current != tokens[i - 1] and types[i - 1] != 3:
                # A smaller class follows a larger, not yet counted one.
                current += number + tokens[i - 1]
            elif current <= number and current != 0:
                # A running value followed by something at least as large,
                # e.g. [1, 100] -> 100.
                current *= number
            elif (not has_scales and not has_units) or (
                '111' in ''.join(str(kind) for kind in types) and not has_tens and not has_scales
            ):
                # Only tens-class tokens, or exactly three digits: read the
                # whole sequence as written-out digits.
                current = int(''.join(str(token) for token in tokens))
                break
            else:
                current += number
    elif not has_scales and not has_tens:
        # Long run of single digits, e.g. [1, 2, 3, 4] -> 1234.
        current = int(''.join(str(token) for token in tokens))
    else:
        count = 0
        current = 0
        for i, token in enumerate(tokens):
            count += 1
            if count == 2:
                previous = tokens[i - 1]
                if types[i - 1] == types[i]:
                    # Same class: concatenate in reading order, (1, 2) -> 12.
                    current += int(str(previous) + str(token))
                elif types[i - 1] > types[i]:
                    # Larger class first, e.g. (40, 5) -> 45.
                    current += previous + token
                else:
                    # Smaller class first, e.g. (1, 100) -> 100.
                    current += previous * token
                count = 0
            elif i == len(tokens) - 1:
                # Unpaired last token of an odd-length list.
                current += token
    return current
def text2int(text):
    """Convert the number words in ``text`` into an integer.

    Thin wrapper around the pipeline: ``tokenize`` turns the text into
    number tokens and ``tokens2int`` combines them.
    """
    tokens = tokenize(text)
    return tokens2int(tokens)
 sentiment = pipeline(task="sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
         button_text2int = gr.Button("text2int")
         button_text2int.click(
+            fn=text2int,
             inputs=inputs_text2int,
             outputs=outputs_text2int,
             api_name="text2int",
         gr.Examples(examples=examples_text2int, inputs=inputs_text2int)
         gr.Markdown(r"""
         ## API
         ```python
         import requests
         ```bash
         curl -X POST https://tangibleai-mathtext.hf.space/run/text2int -H 'Content-Type: application/json' -d '{"data": ["one hundred forty five"]}'
         ```
+        """)
     with gr.Tab("Sentiment Analysis"):
         inputs_sentiment = [
         gr.Examples(examples=examples_sentiment, inputs=inputs_sentiment)
         gr.Markdown(r"""
         ## API
         ```python
         import requests
         ```bash
         curl -X POST https://tangibleai-mathtext.hf.space/run/sentiment-analysis -H 'Content-Type: application/json' -d '{"data": ["You are right!"]}'
         ```
+        """)
 # interface = gr.Interface(lambda x: x, inputs=["text"], outputs=["text"])
 # html_block.input_components = interface.input_components

plot_calls.py CHANGED Viewed

@@ -1,9 +1,29 @@
 import matplotlib.pyplot as plt
 import pandas as pd
-df = pd.read_csv('call_history.csv')  # data loading
-print(df)
-df.plot(by='endpoint', column='delay', kind='box', showmeans=True)
 plt.show()

+from datetime import datetime
 import matplotlib.pyplot as plt
 import pandas as pd
# Uncomment to print data frames in full while debugging:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# Load the semicolon-separated call log.
df = pd.read_csv(filepath_or_buffer='call_history_bash.csv', sep=";")

# Duration of every call in seconds, from its "started"/"finished" stamps.
finished_at = df["finished"].apply(lambda stamp: datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S.%f"))
started_at = df["started"].apply(lambda stamp: datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S.%f"))
df["elapsed"] = (finished_at - started_at).apply(lambda delta: delta.total_seconds())
df.to_csv("output.csv", index=False, sep=";")

# One figure row per distinct "active_students" value:
# box plot on the left, histogram on the right.
student_numbers = df['active_students'].unique()
plt.figure(figsize=(16, 10))
rows = len(student_numbers)
for position, student_number in enumerate(student_numbers):
    data = df[df["active_students"] == student_number]
    left_slot = 2 * position + 1

    plt.subplot(rows, 2, left_slot)
    plt.title("y=seconds, x=active students", x=0.75, y=0.75)
    plt.boxplot(x=data["elapsed"], labels=[student_number])

    plt.subplot(rows, 2, left_slot + 1)
    plt.title("y=count of seconds, x=seconds", x=0.75, y=0.75)
    plt.hist(x=data["elapsed"], bins=25, edgecolor='white')
plt.show()

test_api.py CHANGED Viewed

@@ -7,12 +7,12 @@ import pandas as pd
 import httpx
 from os.path import exists
-NUMBER_OF_CALLS = 20
 headers = {"Content-Type": "application/json; charset=utf-8"}
-base_url = "https://tangibleai-mathtext.hf.space/run/{endpoint}"
-# base_url = "http://localhost:7860/run/{endpoint}"
 data_list_1 = {
     "endpoint": "text2int",

 import httpx
 from os.path import exists
+NUMBER_OF_CALLS = 1
 headers = {"Content-Type": "application/json; charset=utf-8"}
+# base_url = "https://tangibleai-mathtext.hf.space/run/{endpoint}"
+base_url = "http://localhost:7860/run/{endpoint}"
 data_list_1 = {
     "endpoint": "text2int",

test_api.sh CHANGED Viewed

@@ -1,5 +1,13 @@
 #! /bin/env bash
 data_list_1() {
   responses=(
     "one hundred forty five"
@@ -22,39 +30,48 @@ data_list_2() {
   echo "${responses[$1]}"
 }
-text2int="https://tangibleai-mathtext.hf.space/run/text2int"
-text2intpreprocessed="https://tangibleai-mathtext.hf.space/run/text2int-preprocessed"
-sentimentanalysis="https://tangibleai-mathtext.hf.space/run/sentiment-analysis"
-test_endpoint() {
-  start_=$(date +%s.%N)
-  response=$(curl --silent -X POST "$1" -H 'Content-Type: application/json' -d "$2")
-  end_=$(date +%s.%N)
-  diff=$(echo "$end_ - $start_" | bc)
-  printf " endpoint:%s\n data:%s delay:%s:\n %s\n" "$1" "$2" "$diff" "$response"
 }
-echo "start: $(date)"
-for i in {1..20}; do
-  random_value=$((RANDOM % 5))
-  text=$(data_list_1 $random_value)
-  data='{"data": ["'$text'"]}'
-  test_endpoint "$text2int" "$data" >>call_history.txt &
-done
-for i in {1..20}; do
-  random_value=$((RANDOM % 5))
-  text=$(data_list_1 $random_value)
-  data='{"data": ["'$text'"]}'
-  test_endpoint "$text2intpreprocessed" "$data" >>call_history.txt &
-done
-for i in {1..20}; do
   random_value=$((RANDOM % 5))
   text=$(data_list_2 $random_value)
   data='{"data": ["'$text'"]}'
-  test_endpoint "$sentimentanalysis" "$data" >>call_history.txt &
 done
 wait

 #! /bin/env bash
+LOG_FILE_NAME="call_history_bash.csv"
+if [[ ! -f "$LOG_FILE_NAME" ]]; then
+    # Creation of column names if the file does not exits
+    echo "student_id;active_students;endpoint;inputs;outputs;started;finished" > $LOG_FILE_NAME
+fi
 data_list_1() {
   responses=(
     "one hundred forty five"
   echo "${responses[$1]}"
 }
+# endpoints: "text2int" "text2int-preprocessed" "sentiment-analysis"
+# selected endpoint to test
+endpoint="text2int"
+create_random_delay () {
+  # creates a random delay for given arguments
+  echo "scale=8; $RANDOM/32768*$1" | bc
 }
simulate_student() {
  # Simulates one student: sends 100 POST requests to an endpoint, waiting a
  # random 0-10 s after every interaction, and appends one line per request
  # to $LOG_FILE_NAME.
  #
  # Arguments:
  #   $1  student id            (logged as "student_id")
  #   $2  number of students    (logged as "active_students")
  #   $3  endpoint name, appended to the /run/ URL
  #   $4  JSON request body     (logged as "inputs")
  local student_id="$1" student_count="$2" endpoint_name="$3" payload="$4"
  local url="https://tangibleai-mathtext.hf.space/run/$endpoint_name"
  local start_ end_ raw status response
  for _interaction in {1..100}; do
    start_=$(date +"%F %T.%6N")
    # curl prints the HTTP status on its own last line after the body, so a
    # gateway time-out is recognised by its status code rather than by the
    # digits "504" happening to occur somewhere in the response body.
    raw=$(curl --silent -X POST "$url" -H 'Content-Type: application/json' -d "$payload" --write-out '\n%{http_code}')
    status="${raw##*$'\n'}"
    response="${raw%$'\n'*}"
    if [[ "$status" == "504" ]]; then
      response="504 Gateway Time-out"
    fi
    end_=$(date +"%F %T.%6N")
    printf "%s;%s;%s;%s;%s;%s;%s\n" "$student_id" "$student_count" "$endpoint_name" "$payload" "$response" "$start_" "$end_" >>"$LOG_FILE_NAME"
    sleep "$(create_random_delay 10)"
  done
}
echo "start: $(date)"

active_students=250  # the number of students using the system at the same time

# Start one background simulator per student, staggered by a random pause of
# up to one second, then wait for all of them to finish.
# NOTE(review): the payload is drawn from data_list_2 while the target is
# whatever $endpoint is set to - confirm that the two belong together.
for (( i = 1; i <= active_students; i++ )); do
  random_value=$((RANDOM % 5))
  text=$(data_list_2 $random_value)
  data='{"data": ["'$text'"]}'
  simulate_student "student$i" "$active_students" "$endpoint" "$data" &
  sleep "$(create_random_delay 1)"  # adding a random delay between students
done
wait