Spaces:

lihuigu
/

SciPIP

Running

App Files Files Community

SciPIP / src /utils /scipdf /features /text_utils.py

lihuigu

update structure

cee6a24 6 months ago

raw

history blame contribute delete

7.42 kB

	import numpy as np
	import pandas as pd
	import textstat
	import spacy
	from collections import Counter
	from itertools import groupby


	nlp = spacy.load("en_core_web_sm")

	PRESENT_TENSE_VERB_LIST = ["VB", "VBP", "VBZ", "VBG"]
	VERB_LIST = ["VB", "VBP", "VBZ", "VBG", "VBN", "VBD"]
	NOUN_LIST = ["NNP", "NNPS"]


	SECTIONS_MAPS = {
	"Authors": "Authors",
	"AUTHORS": "AUTHORS",
	"Abstract": "Abstract",
	"ABSTRACT": "Abstract",
	"Date": "Date",
	"DATE": "DATE",
	"INTRODUCTION": "Introduction",
	"MATERIALS AND METHODS": "Methods",
	"Materials and methods": "Methods",
	"METHODS": "Methods",
	"RESULTS": "Results",
	"CONCLUSIONS": "Conclusions",
	"CONCLUSIONS AND FUTURE APPLICATIONS": "Conclusions",
	"DISCUSSION": "Discussion",
	"ACKNOWLEDGMENTS": "Acknowledgement",
	"TABLES": "Tables",
	"Tabnles": "Tables",
	"DISCLOSURE": "Disclosure",
	"CONFLICT OF INTEREST": "Disclosure",
	"Acknowledgement": "Acknowledgements",
	}


	def compute_readability_stats(text):
	"""
	Compute reading statistics of the given text
	Reference: https://github.com/shivam5992/textstat

	Parameters
	==========
	text: str, input section or abstract text
	"""
	try:
	readability_dict = {
	"flesch_reading_ease": textstat.flesch_reading_ease(text),
	"smog": textstat.smog_index(text),
	"flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
	"coleman_liau_index": textstat.coleman_liau_index(text),
	"automated_readability_index": textstat.automated_readability_index(text),
	"dale_chall": textstat.dale_chall_readability_score(text),
	"difficult_words": textstat.difficult_words(text),
	"linsear_write": textstat.linsear_write_formula(text),
	"gunning_fog": textstat.gunning_fog(text),
	"text_standard": textstat.text_standard(text),
	"n_syllable": textstat.syllable_count(text),
	"avg_letter_per_word": textstat.avg_letter_per_word(text),
	"avg_sentence_length": textstat.avg_sentence_length(text),
	}
	except:
	readability_dict = {
	"flesch_reading_ease": None,
	"smog": None,
	"flesch_kincaid_grade": None,
	"coleman_liau_index": None,
	"automated_readability_index": None,
	"dale_chall": None,
	"difficult_words": None,
	"linsear_write": None,
	"gunning_fog": None,
	"text_standard": None,
	"n_syllable": None,
	"avg_letter_per_word": None,
	"avg_sentence_length": None,
	}
	return readability_dict


	def compute_text_stats(text):
	"""
	Compute part of speech features from a given spacy wrapper of text

	Parameters
	==========
	text: spacy.tokens.doc.Doc, spacy wrapper of the section or abstract text

	Output
	======
	text_stat: dict, part of speech and text features extracted from the given text
	"""
	try:
	pos = dict(Counter([token.pos_ for token in text]))
	pos_tag = dict(
	Counter([token.tag_ for token in text])
	) # detailed part-of-speech

	n_present_verb = sum(
	[v for k, v in pos_tag.items() if k in PRESENT_TENSE_VERB_LIST]
	)
	n_verb = sum([v for k, v in pos_tag.items() if k in VERB_LIST])

	word_shape = dict(Counter([token.shape_ for token in text])) # word shape
	n_word_per_sents = [len([token for token in sent]) for sent in text.sents]
	n_digits = sum([token.is_digit or token.like_num for token in text])
	n_word = sum(n_word_per_sents)
	n_sents = len(n_word_per_sents)
	text_stats_dict = {
	"pos": pos,
	"pos_tag": pos_tag,
	"word_shape": word_shape,
	"n_word": n_word,
	"n_sents": n_sents,
	"n_present_verb": n_present_verb,
	"n_verb": n_verb,
	"n_digits": n_digits,
	"percent_digits": n_digits / n_word,
	"n_word_per_sents": n_word_per_sents,
	"avg_word_per_sents": np.mean(n_word_per_sents),
	}
	except:
	text_stats_dict = {
	"pos": None,
	"pos_tag": None,
	"word_shape": None,
	"n_word": None,
	"n_sents": None,
	"n_present_verb": None,
	"n_verb": None,
	"n_digits": None,
	"percent_digits": None,
	"n_word_per_sents": None,
	"avg_word_per_sents": None,
	}
	return text_stats_dict


	def compute_journal_features(article):
	"""
	Parse features about journal references from a given dictionary of parsed article e.g.
	number of reference made, number of unique journal refered, minimum year of references,
	maximum year of references, ...

	Parameters
	==========
	article: dict, article dictionary parsed from GROBID and converted to dictionary
	see ``pdf/parse_pdf.py`` for the detail of the output dictionary

	Output
	======
	reference_dict: dict, dictionary of
	"""
	try:
	n_reference = len(article["references"])
	n_unique_journals = len(
	pd.unique([a["journal"] for a in article["references"]])
	)
	reference_years = []
	for reference in article["references"]:
	year = reference["year"]
	if year.isdigit():
	# filter outliers
	if int(year) in range(1800, 2100):
	reference_years.append(int(year))
	avg_ref_year = np.mean(reference_years)
	median_ref_year = np.median(reference_years)
	min_ref_year = np.min(reference_years)
	max_ref_year = np.max(reference_years)
	journal_features_dict = {
	"n_reference": n_reference,
	"n_unique_journals": n_unique_journals,
	"avg_ref_year": avg_ref_year,
	"median_ref_year": median_ref_year,
	"min_ref_year": min_ref_year,
	"max_ref_year": max_ref_year,
	}
	except:
	journal_features_dict = {
	"n_reference": None,
	"n_unique_journals": None,
	"avg_ref_year": None,
	"median_ref_year": None,
	"min_ref_year": None,
	"max_ref_year": None,
	}
	return journal_features_dict


	def merge_section_list(section_list, section_maps=SECTIONS_MAPS, section_start=""):
	"""
	Merge a list of sections into a normalized list of sections,
	you can get the list of sections from parsed article JSON in ``parse_pdf.py`` e.g.

	>> section_list = [s['heading'] for s in article_json['sections']]
	>> section_list_merged = merge_section_list(section_list)

	Parameters
	==========
	section_list: list, list of sections

	Output
	======
	section_list_merged: list, sections
	"""
	sect_map = section_start # text for starting section e.g. ``Introduction``
	section_list_merged = []
	for section in section_list:
	if any([(s.lower() in section.lower()) for s in section_maps.keys()]):
	sect = [s for s in section_maps.keys() if s.lower() in section.lower()][0]
	sect_map = section_maps.get(sect, "") #
	section_list_merged.append(sect_map)
	else:
	section_list_merged.append(sect_map)
	return section_list_merged