java.io.Serializable
, CapabilitiesHandler
, CapabilitiesIgnorer
, CommandlineRunnable
, OptionHandler
, RevisionHandler
public class TweetToSparseFeatureVector extends TweetToFeatureVector
@Article{NRCJAIR14, Title = {Sentiment analysis of short informal texts}, Author = {Kiritchenko, Svetlana and Zhu, Xiaodan and Mohammad, Saif M}, Journal = {Journal of Artificial Intelligence Research}, Year = {2014}, Pages = {723--762}, Volume = {50} }
Modifier and Type | Field | Description |
---|---|---|
static java.lang.String |
RESOURCES_FOLDER_NAME |
Default path to where resources are stored.
|
Constructor | Description |
---|---|
TweetToSparseFeatureVector() |
Modifier and Type | Method | Description |
---|---|---|
it.unimi.dsi.fastutil.objects.Object2IntMap<java.lang.String> |
calculateDocVec(java.lang.String content) |
Calculates a vector of attributes from a String
|
int |
getCharNgramMaxDim() |
|
int |
getCharNgramMinDim() |
|
int |
getClustNgramMaxDim() |
|
int |
getMinAttDocs() |
|
int |
getPosNgramMaxDim() |
|
java.util.List<java.lang.String> |
getPOStags(java.util.List<java.lang.String> tokens) |
Returns POS tags from a List of tokens using the CMU TweetNLP tool
|
java.io.File |
getTaggerFile() |
|
TechnicalInformation |
getTechnicalInformation() |
Returns an instance of a TechnicalInformation object, containing
detailed information about the technical background of this class,
e.g., paper reference or book this class is based on.
|
java.io.File |
getWordClustFile() |
|
int |
getWordNgramMaxDim() |
|
java.lang.String |
globalInfo() |
Returns a string describing this filter.
|
void |
initializeTagger() |
Initializes the POS tagger
|
void |
initiliazeNegationEvaluator() |
Initializes the NegationEvaluator object
|
boolean |
isCalculateCharNgram() |
|
boolean |
isFreqWeights() |
|
boolean |
isNegateTokens() |
|
static void |
main(java.lang.String[] args) |
Main method for testing this class.
|
void |
setCalculateCharNgram(boolean calculateCharNgram) |
|
void |
setCharNgramMaxDim(int charNgramMaxDim) |
|
void |
setCharNgramMinDim(int charNgramMinDim) |
|
void |
setClustNgramMaxDim(int clustNgramMaxDim) |
|
void |
setFreqWeights(boolean freqWeights) |
|
void |
setMinAttDocs(int minAttDocs) |
|
void |
setNegateTokens(boolean negateTokens) |
|
void |
setPosNgramMaxDim(int posNgramMaxDim) |
|
void |
setTaggerFile(java.io.File taggerFile) |
|
void |
setWordClustFile(java.io.File wordClustFile) |
|
void |
setWordNgramMaxDim(int wordNgramMaxDim) |
|
void |
tweetsToVectors(Instances tweetInstances) |
Processes a batch of tweets.
|
batchFilterFile, debugTipText, doNotCheckCapabilitiesTipText, filterFile, getCapabilities, getCopyOfInputFormat, getDebug, getDoNotCheckCapabilities, getOutputFormat, getRevision, isFirstBatchDone, isNewBatch, isOutputFormatDefined, makeCopies, makeCopy, mayRemoveInstanceAfterFirstBatchDone, numPendingOutput, output, outputPeek, postExecution, preExecution, run, runFilter, setDebug, setDoNotCheckCapabilities, toString, useFilter, wekaStaticWrapper
equals, getClass, hashCode, notify, notifyAll, wait, wait, wait
batchFinished, input
setInputFormat
allowAccessToFullInputFormat, getCapabilities, getOptions, getStemmer, getStopwordsHandler, getTextIndex, getTokenizer, isReduceRepeatedLetters, isStandarizeUrlsUsers, isToLowerCase, listOptions, setOptions, setReduceRepeatedLetters, setStandarizeUrlsUsers, setStemmer, setStopwordsHandler, setTextIndex, setTokenizer, setToLowerCase
public static java.lang.String RESOURCES_FOLDER_NAME
public java.lang.String globalInfo()
globalInfo
in class SimpleFilter
public TechnicalInformation getTechnicalInformation()
public void initializeTagger()
public void initiliazeNegationEvaluator()
public java.util.List<java.lang.String> getPOStags(java.util.List<java.lang.String> tokens)
tokens
- the input tokens
public it.unimi.dsi.fastutil.objects.Object2IntMap<java.lang.String> calculateDocVec(java.lang.String content)
content
- the input
public void tweetsToVectors(Instances tweetInstances)
tweetInstances
- the input tweets
@OptionMetadata(displayName="minAttDocs", description="Minimum frequency of a sparse attribute to be considered in the attribute space.", commandLineParamName="M", commandLineParamSynopsis="-M <int>", displayOrder=6) public int getMinAttDocs()
public void setMinAttDocs(int minAttDocs)
@OptionMetadata(displayName="freqWeights", description="True if the value of each feature is set to its frequency in the tweet. Boolean weights are used otherwise.\n", commandLineParamIsFlag=true, commandLineParamName="F", commandLineParamSynopsis="-F", displayOrder=7) public boolean isFreqWeights()
public void setFreqWeights(boolean freqWeights)
@OptionMetadata(displayName="wordNgramMaxDim", description="Maximum size for the word n-gram features. \n\t Set this variable to zero for no word n-gram attributes. All word n-grams from i=1 to this value will be extracted.", commandLineParamName="Q", commandLineParamSynopsis="-Q <int>", displayOrder=8) public int getWordNgramMaxDim()
public void setWordNgramMaxDim(int wordNgramMaxDim)
@OptionMetadata(displayName="negateTokens", description="Add a prefix to words occurring in negated contexts e.g., I don\'t like you => I don\'t NEG-like NEG-you.\n \t The prefixes only affect word n-gram features. The scope of negation finishes with the next punctuation mark.", commandLineParamIsFlag=true, commandLineParamName="R", commandLineParamSynopsis="-R", displayOrder=9) public boolean isNegateTokens()
public void setNegateTokens(boolean negateTokens)
@OptionMetadata(displayName="calculateCharNgram", description="Calculate character n-gram features.", commandLineParamIsFlag=true, commandLineParamName="A", commandLineParamSynopsis="-A", displayOrder=10) public boolean isCalculateCharNgram()
public void setCalculateCharNgram(boolean calculateCharNgram)
@OptionMetadata(displayName="charNgramMinDim", description="The minimum dimension for character n-grams.", commandLineParamName="D", commandLineParamSynopsis="-D <int>", displayOrder=11) public int getCharNgramMinDim()
public void setCharNgramMinDim(int charNgramMinDim)
@OptionMetadata(displayName="charNgramMaxDim", description="The maximum dimension for character n-grams.", commandLineParamName="E", commandLineParamSynopsis="-E <int>", displayOrder=12) public int getCharNgramMaxDim()
public void setCharNgramMaxDim(int charNgramMaxDim)
@OptionMetadata(displayName="posNgramMaxDim", description="The maximum size for POS n-grams. Set this variable to zero for no POS attributes. \n\t The tweets are POS-tagged using the CMU TweetNLP tool.", commandLineParamName="G", commandLineParamSynopsis="-G <int>", displayOrder=13) public int getPosNgramMaxDim()
public void setPosNgramMaxDim(int posNgramMaxDim)
@OptionMetadata(displayName="clustNgramMaxDim", description="The maximum dimension for n-grams calculated with Brown word clusters.\n\t Set this variable to zero for no word-clusters attributes. \n\t The word clusters are taken from the CMU Tweet NLP tool.", commandLineParamName="I", commandLineParamSynopsis="-I <int>", displayOrder=14) public int getClustNgramMaxDim()
public void setClustNgramMaxDim(int clustNgramMaxDim)
@OptionMetadata(displayName="taggerFile", description="The file with TweetNLP POS tagger model.", commandLineParamName="taggerFile", commandLineParamSynopsis="-taggerFile <string>", displayOrder=15) public java.io.File getTaggerFile()
public void setTaggerFile(java.io.File taggerFile)
@OptionMetadata(displayName="wordClustFile", description="The file with the word clusters in gzip format.", commandLineParamName="wordClustFile", commandLineParamSynopsis="-wordClustFile <string>", displayOrder=16) public java.io.File getWordClustFile()
public void setWordClustFile(java.io.File wordClustFile)
public static void main(java.lang.String[] args)
args
- should contain arguments to the filter: use -h for help