java.io.Serializable
, CapabilitiesHandler
, CapabilitiesIgnorer
, CommandlineRunnable
, OptionHandler
, RevisionHandler
public class TweetCentroid extends TweetToFeatureVector
Modifier and Type | Field | Description |
---|---|---|
static java.lang.String |
RESOURCES_FOLDER_NAME |
Default path to where resources are stored.
|
Constructor | Description |
---|---|
TweetCentroid() |
Modifier and Type | Method | Description |
---|---|---|
it.unimi.dsi.fastutil.objects.Object2IntMap<java.lang.String> |
calculateDocVec(java.util.List<java.lang.String> tokens) |
Calculates tweet vectors from a list of tokens.
|
void |
computeWordVecsAndVoc(Instances inputFormat) |
Calculates the vocabulary and the word vectors from an Instances object.
The vocabulary is only extracted the first time the filter is run.
|
int |
getMinAttDocs() |
|
int |
getMinInstDocs() |
|
TechnicalInformation |
getTechnicalInformation() |
Returns an instance of a TechnicalInformation object, containing
detailed information about the technical background of this class,
e.g., paper reference or book this class is based on.
|
java.io.File |
getWordClustFile() |
|
java.lang.String |
globalInfo() |
|
boolean |
isCreateClustAtts() |
|
boolean |
isCreateWordAtts() |
|
boolean |
isFreqWeights() |
|
boolean |
isIncludeMetaData() |
|
static void |
main(java.lang.String[] args) |
Main method for testing this class.
|
void |
setCreateClustAtts(boolean createClustAtts) |
|
void |
setCreateWordAtts(boolean createWordAtts) |
|
void |
setFreqWeights(boolean freqWeights) |
|
void |
setIncludeMetaData(boolean includeMetaData) |
|
void |
setMinAttDocs(int minAttDocs) |
|
void |
setMinInstDocs(int minInstDocs) |
|
void |
setWordClustFile(java.io.File wordClustFile) |
batchFilterFile, debugTipText, doNotCheckCapabilitiesTipText, filterFile, getCapabilities, getCopyOfInputFormat, getDebug, getDoNotCheckCapabilities, getOutputFormat, getRevision, isFirstBatchDone, isNewBatch, isOutputFormatDefined, makeCopies, makeCopy, mayRemoveInstanceAfterFirstBatchDone, numPendingOutput, output, outputPeek, postExecution, preExecution, run, runFilter, setDebug, setDoNotCheckCapabilities, toString, useFilter, wekaStaticWrapper
equals, getClass, hashCode, notify, notifyAll, wait, wait, wait
batchFinished, input
setInputFormat
allowAccessToFullInputFormat, getCapabilities, getOptions, getStemmer, getStopwordsHandler, getTextIndex, getTokenizer, isReduceRepeatedLetters, isStandarizeUrlsUsers, isToLowerCase, listOptions, setOptions, setReduceRepeatedLetters, setStandarizeUrlsUsers, setStemmer, setStopwordsHandler, setTextIndex, setTokenizer, setToLowerCase
public static java.lang.String RESOURCES_FOLDER_NAME
public TechnicalInformation getTechnicalInformation()
public java.lang.String globalInfo()
Overrides: globalInfo in class SimpleFilter
public it.unimi.dsi.fastutil.objects.Object2IntMap<java.lang.String> calculateDocVec(java.util.List<java.lang.String> tokens)
tokens
- a tokenized tweet
public void computeWordVecsAndVoc(Instances inputFormat)
inputFormat
- the input Instances
@OptionMetadata(displayName="minAttDocs", description="Minimum frequency of a sparse attribute to be considered in the attribute space.", commandLineParamName="M", commandLineParamSynopsis="-M <int>", displayOrder=6) public int getMinAttDocs()
public void setMinAttDocs(int minAttDocs)
@OptionMetadata(displayName="minInstDocs", description="Minimum frequency of a word to be considered in the instance space.", commandLineParamName="N", commandLineParamSynopsis="-N <int>", displayOrder=7) public int getMinInstDocs()
public void setMinInstDocs(int minInstDocs)
@OptionMetadata(displayName="createWordAtts", description="True for creating unigram attributes.", commandLineParamIsFlag=true, commandLineParamName="W", commandLineParamSynopsis="-W", displayOrder=8) public boolean isCreateWordAtts()
public void setCreateWordAtts(boolean createWordAtts)
@OptionMetadata(displayName="createClustAtts", description="True for creating attributes using word clusters", commandLineParamIsFlag=true, commandLineParamName="C", commandLineParamSynopsis="-C", displayOrder=9) public void setCreateClustAtts(boolean createClustAtts)
public boolean isCreateClustAtts()
@OptionMetadata(displayName="wordClustFile", description="The file containing the word clusters.", commandLineParamName="H", commandLineParamSynopsis="-H <string>", displayOrder=10) public java.io.File getWordClustFile()
public void setWordClustFile(java.io.File wordClustFile)
@OptionMetadata(displayName="considerNumericAtts", description="True for considering all numeric attributes in the original dataset in the averaged word vectors.", commandLineParamIsFlag=true, commandLineParamName="natt", commandLineParamSynopsis="-natt", displayOrder=11) public boolean isIncludeMetaData()
public void setIncludeMetaData(boolean includeMetaData)
@OptionMetadata(displayName="freqWeights", description="True if the value of each feature is set to its frequency in the tweet. Boolean weights are used otherwise.", commandLineParamIsFlag=true, commandLineParamName="F", commandLineParamSynopsis="-F", displayOrder=12) public boolean isFreqWeights()
public void setFreqWeights(boolean freqWeights)
public static void main(java.lang.String[] args)
args
- should contain arguments to the filter: use -h for help