#!/usr/bin/env python3
""" | |
Copyright 2019 Brian Thompson | |
Licensed under the Apache License, Version 2.0 (the "License"); | |
you may not use this file except in compliance with the License. | |
You may obtain a copy of the License at | |
https://www.apache.org/licenses/LICENSE-2.0 | |
Unless required by applicable law or agreed to in writing, software | |
distributed under the License is distributed on an "AS IS" BASIS, | |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
See the License for the specific language governing permissions and | |
limitations under the License. | |
This is a standalone example of creating a document vector from sentence vectors | |
following https://aclanthology.org/2020.emnlp-main.483 | |
""" | |

import numpy as np
from mcerp import PERT  # pip install mcerp; see https://github.com/tisimst/mcerp/blob/master/mcerp/__init__.py

NUM_TIME_SLOTS = 16  # number of time slots ("chunks") the document is divided into
PERT_G = 20  # PERT shape parameter; larger values concentrate each sentence's weight around its location

# PERT is very slow (~50 ms per distribution), so we precompute and cache a bank of PERT distributions
_num_banks = 100
_xx = np.linspace(start=0, stop=1, num=NUM_TIME_SLOTS)
PERT_BANKS = []
for _pp in np.linspace(0, 1, num=_num_banks):
    if _pp == 0.5:  # special case: when the peak is exactly midway between low and high, mcerp ignores g, so nudge the peak slightly
        _pp += 0.001
    pert = PERT(low=-0.001, peak=_pp, high=1.001, g=PERT_G, tag=None)
    _yy = pert.rv.pdf(_xx)  # evaluate the PERT density at each time slot
    _yy = _yy / sum(_yy)  # normalize so the weights sum to 1
    PERT_BANKS.append(_yy)
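
# Illustrative sanity check (an addition, not in the original script): every cached
# distribution should sum to 1 after normalization, so each sentence contributes the
# same total weight regardless of which bank entry it maps to.
assert all(abs(_bank.sum() - 1.0) < 1e-6 for _bank in PERT_BANKS)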

np.set_printoptions(threshold=50, precision=5)

def build_doc_embedding(sent_vecs, sent_counts):
    # ensure sentence counts are >= 1
    sent_counts = np.clip(sent_counts, a_min=1, a_max=None)
    # down-weight frequent sentences: scale each sentence vector by 1/count
    sent_weights = 1.0 / np.array(sent_counts)
    scaled_sent_vecs = np.multiply(sent_vecs.T, sent_weights).T
    # space sentence centers equally along the document timeline [0, 1]
    sent_centers = np.linspace(0, 1, len(scaled_sent_vecs))
    # find the weighting for each sentence, for each time slot
    sentence_loc_weights = np.zeros((len(sent_centers), NUM_TIME_SLOTS))
    for sent_ii, p in enumerate(sent_centers):
        bank_idx = int(p * (len(PERT_BANKS) - 1))  # find the nearest cached PERT distribution
        sentence_loc_weights[sent_ii, :] = PERT_BANKS[bank_idx]
    # make each chunk vector: a location-weighted sum of the scaled sentence vectors
    doc_chunk_vec = np.matmul(scaled_sent_vecs.T, sentence_loc_weights).T
    # concatenate chunk vectors into a single vector for the full document
    doc_vec = doc_chunk_vec.flatten()
    # L2-normalize the document vector (the epsilon guards against division by zero)
    doc_vec = doc_vec / (np.linalg.norm(doc_vec) + 1e-5)
    return doc_vec
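
# Hypothetical helper (an addition, not part of the original script): because
# build_doc_embedding returns (approximately) unit-length vectors, cosine similarity
# between two documents reduces to a dot product. This is one way the embeddings
# might be compared, e.g. to score candidate document pairs for alignment.
def doc_similarity(doc_vec_a, doc_vec_b):
    return float(np.dot(doc_vec_a, doc_vec_b))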

def demo():
    # Replace sent_vecs with LASER/LaBSE/etc. embeddings of each sentence in your document,
    # after projecting the sentence embeddings into a lower-dimensional space using something like PCA (see the paper for details).
    sent_emb_size = 32  # document embedding size will be sent_emb_size * NUM_TIME_SLOTS
    n_sents = 7
    sent_vecs = np.random.rand(n_sents, sent_emb_size) - 0.5
    # Replace sent_counts with the number of times each sentence has been seen in your corpus.
    sent_counts = np.random.randint(low=1, high=50, size=n_sents)
    doc_emb = build_doc_embedding(sent_vecs, sent_counts)
    print('Document Embedding:', doc_emb)
    print('Document Embedding Size:', doc_emb.shape)
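    # Illustrative comparison (uses the hypothetical doc_similarity helper above):
    # a document is maximally similar to itself (~1.0, since embeddings are normalized),
    # while an unrelated random document should score near 0 in expectation.
    other_doc_emb = build_doc_embedding(np.random.rand(n_sents, sent_emb_size) - 0.5, sent_counts)
    print('Self similarity:', doc_similarity(doc_emb, doc_emb))
    print('Similarity to a random doc:', doc_similarity(doc_emb, other_doc_emb))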

if __name__ == '__main__':
    demo()