Spring Boot PagingAndSortingRepository search: объединение нескольких параметров для сложного поиска

Шаги для преобразования: Document-> Sentences-> Tokens-> POS-> Lemmas

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#example text text = 'What can I say about this place. The staff of these restaurants is nice and the eggplant is not bad'

class Splitter(object):
    """
    split the document into sentences and tokenize each sentence
    """
    def __init__(self):
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self,text):
        """
        out : ['What', 'can', 'I', 'say', 'about', 'this', 'place', '.']
        """
        # split into single sentence
        sentences = self.splitter.tokenize(text)
        # tokenization in each sentences
        tokens = [self.tokenizer.tokenize(sent) for sent in sentences]
        return tokens


class LemmatizationWithPOSTagger(object):
    def __init__(self):
        pass
    def get_wordnet_pos(self,treebank_tag):
        """
        return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # As default pos in lemmatization is Noun
            return wordnet.NOUN

    def pos_tag(self,tokens):
        # find the pos tagginf for each tokens [('What', 'WP'), ('can', 'MD'), ('I', 'PRP') ....
        pos_tokens = [nltk.pos_tag(token) for token in tokens]

        # lemmatization using pos tagg   
        # convert into feature set of [('What', 'What', ['WP']), ('can', 'can', ['MD']), ... ie [original WORD, Lemmatized word, POS tag]
        pos_tokens = [ [(word, lemmatizer.lemmatize(word,self.get_wordnet_pos(pos_tag)), [pos_tag]) for (word,pos_tag) in pos] for pos in pos_tokens]
        return pos_tokens

lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

#step 1 split document into sentence followed by tokenization
tokens = splitter.split(text)

#step 2 lemmatization using pos tagger 
lemma_pos_token = lemmatization_using_pos_tagger.pos_tag(tokens)
print(lemma_pos_token)
0
задан Max 17 January 2019 в 08:26
поделиться

1 ответ

Вы можете использовать @Query для сложных запросов, которые довольно просты в использовании.

Используя @Query, ваш код будет выглядеть примерно так:

@Query(SELECT v FROM vwFact v WHERE v.applicationId = :applicationId and v.mdName01 = :mdName01)
Page<vwFact> findAllByApplicationIdAndMdName01StartingWithOrApplicationIdAndMdName02StartingWithOrApplicationIdAndMdName03StartingWithOrApplicationIdAndMdName04StartingWith (
    @Param(value = "applicationId") String applicationId,
    @Param(value = "mdName01") String mdName01,
    @Param(value = "mdName02") String mdName02,
    @Param(value = "mdName03") String mdName03,
    @Param(value = "mdName04") String mdName04,
    Pageable pageable

);

Вот хороший учебник для начала.

0
ответ дан mahfuj asif 17 January 2019 в 08:26
поделиться
Другие вопросы по тегам:

Похожие вопросы: