#r "nuget: Plotly.NET, 2.0.0-preview.6"
#r "nuget: Newtonsoft.Json"
#r "nuget: MathNet.Numerics.FSharp, 4.15.0"
open System
Environment.CurrentDirectory <- __SOURCE_DIRECTORY__
fsi.AddPrinter<DateTime>(fun dt -> dt.ToString("s"))
open Newtonsoft.Json
open Plotly.NET
open MathNet.Numerics.LinearAlgebra
/// Three-way label carried by each transcript in the JSON sample.
type Label =
    | Positive
    | Negative
    | Neutral

/// One record of the labeled-transcript JSON file (the element type
/// that `readJson` deserializes).
type LabeledTranscript =
    { /// Pair that `generateCall` splits into ticker (fst) and exchange (snd).
      TickerExchange: string * string
      /// Presumably the raw transcript text of the call -- TODO confirm.
      EarningsCall: string
      /// Return figure that `generateCall` copies into `Call.Signal`.
      CumulativeReturn: float
      Label: Label }
/// Reads `jsonFile` from disk and deserializes its contents into an
/// array of `LabeledTranscript` records.
///
/// The path is resolved against the current directory, which the script
/// sets to `__SOURCE_DIRECTORY__` at the top of the file.
let readJson (jsonFile: string) =
    // The pipeline needs the file contents as its source value.
    System.IO.File.ReadAllText(jsonFile)
    |> fun json -> JsonConvert.DeserializeObject<array<LabeledTranscript>>(json)
/// Train/test partition of the labeled transcripts.
///
/// The transcripts are shuffled by sorting on keys drawn from a
/// fixed-seed generator (seed 42). `train` holds indices 0 through
/// `cutoff` inclusive, `test` holds everything after `cutoff`, where
/// `cutoff` is 80% of the sample size truncated to an int.
let train, test =
    let rnd = System.Random(42)
    let shuffled =
        "data-cache/LabeledTranscriptsFullSample.json"
        |> readJson
        |> Seq.sortBy (fun _ -> rnd.Next())
        |> Seq.toArray
    let cutoff = int (float shuffled.Length * 0.8)
    shuffled.[.. cutoff], shuffled.[cutoff + 1 ..]
Text data, unstructured data ...
- Change all words in each article to lower case letters
- Expand contractions such as "haven't" to "have not"
- Delete numbers, punctuation, special symbols, and non-English words
- Analyze words as a single root, e.g., "disappointment" to "disappoint"
- Porter's stemming algorithm
Split each article into a list of words or phrases or nGrams
Original: "The five boxing wizards jump quickly"
- ["The", "five", "boxing", "wizards", "jump", "quickly"]
- ["The five", "five boxing", "boxing wizards", "wizards jump", "jump quickly"]
- Transform each block of text to a vector of word counts
#load "TextPreprocessing.fsx"
open Preprocessing.Normalization
open Preprocessing.Tokenization
open Preprocessing.NltkData
/// Identifies a call by ticker and exchange.
type CallId =
    { Ticker: string
      Exchange: string }

/// A text item together with the number of times it was counted.
type WordCount =
    { Word: string
      Count: int }

/// Two-way sentiment flag. Defined after `Label`, so an unqualified
/// `Positive` / `Negative` below this point resolves to this type.
type Sentiment =
    | Positive
    | Negative
/// A call reduced to its id, its bag of words and a numeric signal.
type Call =
    { CallId: CallId
      WordCount: WordCount []
      Signal: float }

    /// `Positive` when `Signal` is strictly greater than zero,
    /// `Negative` otherwise (zero included).
    member call.Flag =
        match call.Signal with
        | signal when signal > 0. -> Positive
        | _ -> Negative
/// Runs `text` through the preprocessing helpers loaded from
/// `TextPreprocessing.fsx`: word filtering, contraction expansion,
/// 1-gram tokenization, then stop-word removal.
///
/// NOTE(review): `getOnlyWords` runs before `expandContractions`; if the
/// former strips apostrophes the latter may have nothing left to expand.
/// Confirm against the helper implementations.
let preprocessText (text: string) =
    // The pipeline needs the input text as its source value.
    text
    // Normalization
    |> getOnlyWords
    |> expandContractions
    // Tokenization
    |> nGrams 1
    // Stop words removal
    |> Seq.choose removeStopWords
/// Converts a labeled transcript into a `Call`: the ticker/exchange pair
/// becomes the `CallId`, the preprocessed transcript becomes a bag of
/// words, and `CumulativeReturn` becomes the `Signal`.
let generateCall (xs: LabeledTranscript) =
    let callId =
        { Ticker = fst xs.TickerExchange
          Exchange = snd xs.TickerExchange }
    let wordCount =
        // `EarningsCall` is the only text field on the record, so it is
        // the source of the word-count pipeline.
        xs.EarningsCall
        |> preprocessText
        // Bag of words
        |> Seq.countBy id
        |> Seq.map (fun (word, count) -> { Word = word; Count = count })
        |> Seq.toArray
    { CallId = callId
      WordCount = wordCount
      Signal = xs.CumulativeReturn }
/// `Call` representations of the train and test transcripts, built in
/// parallel with `generateCall`.
let trainCalls, testCalls =
    train |> Array.Parallel.map generateCall,
    test |> Array.Parallel.map generateCall
Topic detection or topic modeling is a technique of automatically extracting meaning from texts
by identifying recurrent themes or topics.
Topic modeling is a method for analyzing large volumes of unlabeled text data. It helps in:
- Discovering hidden topical patterns that are present across the collection
- Annotating documents according to these topics
- Using these annotations to organize, search and summarize texts
A topic consists of a cluster of words that frequently occur together.
This is essentially a clustering problem - we can think of both words and documents as being clustered.
There are many techniques that are used to obtain topic models. One of the commonly used is
Latent Dirichlet Allocation (LDA)
SESTM: A Supervised Sentiment Extraction Algorithm
Feature selection: create a set of sentiment-charged words via predictive (correlation) screening
- Assign prediction/sentiment weights to these words via a supervised topic model (i.e. estimate positive and negative sentiment topics)
- Aggregate terms into an article-level predictive score via penalized likelihood.
Model is motivated by the view that return-predictive content of a given event is
reflected both in the news article text and in the returns of related assets.
Method has an objective of extracting general return predictive content from text.
Simplicity: only requires standard econometric techniques such as correlation analysis and maximum likelihood estimation.
Additionally, unlike other deep learning approaches, the proposed supervised learning approach is entirely "white-box".
- Minimal computing power required.
- Free of any pre-existing sentiment dictionary (polarized words, sentiment lexicons, etc...). No use of ad hoc word-weighting schemes.
Bottom line: A sentiment scoring model is constructed from the joint behaviour of
article text and stock returns.
Theoretical Results
- The guarantee of retrieving a sentiment dictionary from training data via correlation screening.
- The derivation of sharp error bounds for parameter estimation. The error bounds depend on the scale
of the corpus (e.g., size of the vocabulary, total number of text documents, average number of words
per document, etc.), and the strength of sentiment signals (e.g., the sensitivity of returns to sentiment,
sensitivity of text generation to sentiment, etc.).
- The error of predicting the sentiment score of a newly arriving article is both derived and quantified.
Objective: Isolate the subset of sentiment-charged words (remove sentiment-neutral words, i.e. noise).
Intuitively, if a word frequently co-occurs in articles that are accompanied
by positive returns, that word is likely to convey positive sentiment.
Calculate the frequency with which each word (or phrase) j co-occurs with a positive
return. (screening-score \(f_{j}\))
Compare \(f_{j}\) with proper thresholds and create the sentiment-charged set of words \(S\).
Before computing any scores, we first need to find the frequency and "occurrence" of each word or text item in the corpus of documents.
While the frequency of a text item or word is simply its total count across all documents, an item's occurrence is the number of documents that include it.
/// Per-sentiment word statistics over the training calls.
///
/// Both results map a `Sentiment` flag to a `Map<string, int>` keyed by word:
/// - `itemOccurenceByGroup`: number of calls in the group containing the word
///   (each call contributes one `WordCount` entry per distinct word).
/// - `itemFrequencyByGroup`: total count of the word over all calls in the group.
let itemOccurenceByGroup, itemFrequencyByGroup =
    trainCalls
    |> Seq.groupBy (fun xs -> xs.Flag)
    |> Seq.map (fun (group, callsOfGroup) ->
        callsOfGroup
        |> Seq.collect (fun xs -> xs.WordCount)
        |> Seq.groupBy (fun xs -> xs.Word)
        |> Seq.map (fun (wordId, wordCounts) ->
            // Occurrence (# of articles in which word j appears)
            (wordId, wordCounts |> Seq.length),
            // Frequency (total count of word j in all articles)
            (wordId, wordCounts |> Seq.sumBy (fun xs -> xs.Count)))
        |> Seq.toArray
        |> fun xs ->
            (group, xs |> Array.map fst |> Map),
            (group, xs |> Array.map snd |> Map))
    |> Seq.toArray
    |> fun xs ->
        (xs |> Array.map fst |> Map),
        (xs |> Array.map snd |> Map)
/// Looks up the count stored for `item` under `group` in a two-level
/// sentiment -> word -> count map. Returns `None` when either the group
/// or the item is absent.
let countOfItemInGroup (group: Sentiment)
                       (wordSentimentMap : Map<Sentiment, Map<string, int>>)
                       (item: string) =
    match Map.tryFind group wordSentimentMap with
    | Some countsOfGroup -> Map.tryFind item countsOfGroup
    | None -> None
// Example lookups for the word "sales" among Positive-flagged calls:
// first in the frequency map, then in the occurrence map.
countOfItemInGroup Positive itemFrequencyByGroup "sales"
countOfItemInGroup Positive itemOccurenceByGroup "sales"
While the frequency of a text item or word is simply its total count across all documents, the occurrence of text item j is the number of documents that include it.
We can then define two variants of screening scores:
- Screening score based on total word frequency:
\[f_{j} = \frac{{\text{count of word } j \text{ in articles with } sgn(y) = +1 }}{\text{count of word } j \text{ in all articles}} \]
- Screening score based on word occurrence across documents:
\[f_{j}^{*} = \frac{{\text{count of articles including word } j \text{ in articles with } sgn(y) = +1 }}{\text{count of articles including word } j }\]
/// Which statistic a screening score was computed from.
type CountType =
    | Frequency
    | Occurence

/// Screening result for a single text item.
type TextItemScreening =
    { TextItem: string
      /// Share of the item's count that falls in the Positive group
      /// (as computed by `getScores`).
      Score: float
      /// Item count summed over the groups in which it appears.
      Count: float
      CountType: CountType }
/// Vocabulary (training set only): every distinct word that appears in
/// the bag of words of at least one training call.
let vocabulary =
    trainCalls
    |> Seq.collect (fun xs -> xs.WordCount |> Array.map (fun xs -> xs.Word))
    |> Seq.distinct
    |> Seq.toArray
/// Computes a screening score for every vocabulary item from the given
/// sentiment -> word -> count map (frequency or occurrence).
///
/// For each item the score is `positive count / (positive + negative count)`;
/// an item seen in only one group scores 1.0 (Positive only) or 0.0
/// (Negative only). Items found in neither group are dropped.
/// `countType` is recorded on each result to say which map was used.
/// Returns a map from text item to its `TextItemScreening`.
let getScores (wordSentimentMap: Map<Sentiment, Map<string, int>>)
              (countType: CountType) =
    let getItemScore item =
        let generateItemScore score count =
            { TextItem = item; Score = score; Count = count; CountType = countType }
        let posN = countOfItemInGroup Positive wordSentimentMap item
        let negN = countOfItemInGroup Negative wordSentimentMap item
        match posN, negN with
        | Some p, Some n ->
            let count = float (p + n)
            Some (generateItemScore (float p / count) count)
        | Some p, None -> Some (generateItemScore 1. (float p))
        | None, Some n -> Some (generateItemScore 0. (float n))
        // Spelled out instead of `_` so the match stays visibly exhaustive.
        | None, None -> None
    // Compute text item scores
    vocabulary
    |> Array.Parallel.choose getItemScore
    |> Array.map (fun xs -> xs.TextItem, xs)
    |> Map
/// Screening scores computed from the occurrence map.
let itemOccurenceScores = getScores itemOccurenceByGroup Occurence

/// Screening scores computed from the frequency map.
let itemFrequencyScores = getScores itemFrequencyByGroup Frequency
Histogram: Item scores
// Histogram of item scores strictly between 0.25 and 0.75.
// NOTE(review): the source of this pipeline was missing. The frequency
// scores are used here because `getChargedItems` thresholds on them; swap in
// `itemOccurenceScores` to plot the occurrence-based variant instead.
itemFrequencyScores
|> Map.toArray
|> Array.map (fun (_, screening) -> screening.Score)
|> Array.filter (fun score -> score > 0.25 && score < 0.75)
|> Chart.Histogram
|> Chart.Show
\[\widehat{S} = \{j: f_{j} \geq \widehat{\pi} + \alpha_{+}, \text{ or } f_{j} \leq \widehat{\pi} - \alpha_{-} \} \cap \{ j: k_{j} \geq \kappa\}\]
- \(f_{j} = \text{Sentiment-screening score of word } j\)
- \(\widehat{\pi} = \text{Fraction of articles tagged with a positive return}\)
- \(\alpha_{+} = \text{Upper sentiment-score threshold}\)
- \(\alpha_{-} = \text{Lower sentiment-score threshold}\)
- \(k_{j} = \text{count of word } j \text{ in all articles}\)
The thresholds (\(\alpha_{+}, \alpha_{-}, \kappa\)) are hyper-parameters that can be tuned via cross-validation.
/// Sentiment-charged words: the vocabulary items whose frequency-based
/// score lies at or outside the thresholds and whose occurrence count is
/// at least `kappaPct` of the number of training documents.
///
/// - `alphaLower`: margin subtracted from the positive-call share to get
///   the lower score threshold.
/// - `alphaUpper`: margin added to the positive-call share to get the
///   upper score threshold.
/// - `kappaPct`: minimum occurrence count, as a fraction of `train.Length`.
let getChargedItems alphaLower alphaUpper kappaPct =
    // Minimum number of training documents an item must appear in
    let kappa = kappaPct * float train.Length
    // Upper and lower score thresholds around the share of Positive calls
    let upperThresh, lowerThresh =
        trainCalls
        |> Array.filter (fun xs -> xs.Flag = Positive)
        |> fun xs -> float xs.Length / float train.Length
        |> fun pieHat -> (pieHat + alphaUpper), (pieHat - alphaLower)
    let isCharged item =
        match itemFrequencyScores.TryFind item, itemOccurenceScores.TryFind item with
        | Some freqScore, Some occScore ->
            let scoreIsExtreme =
                freqScore.Score >= upperThresh || freqScore.Score <= lowerThresh
            if scoreIsExtreme && occScore.Count >= kappa then Some item else None
        | _ -> None
    vocabulary
    |> Array.choose isCharged
// Screening hyper-parameters: lower/upper score margins and the value
// passed to getChargedItems as its kappaPct argument.
let alphaLower, alphaUpper, kappa = (0.0075, 0.0075, 0.5)
// Items selected by getChargedItems under the hyper-parameters above.
let chargedItems = getChargedItems alphaLower alphaUpper kappa
/// Projects a call onto the sentiment-charged vocabulary.
///
/// The returned call has exactly one `WordCount` entry per item in
/// `chargedItems`, sorted by word. Items missing from the call get a
/// count of 0, and words outside `chargedItems` are dropped. Every
/// filtered call therefore has the same length and word order.
let filterCall (call: Call): Call =
    let textItemsFromCall =
        call.WordCount
        |> Array.map (fun xs -> xs.Word, xs)
        |> Map
    let filteredItemCounts =
        chargedItems
        |> Array.map (fun chargedWord ->
            match textItemsFromCall.TryFind chargedWord with
            | Some wordCount -> wordCount
            | None -> { Word = chargedWord; Count = 0 })
        |> Array.sortBy (fun xs -> xs.Word)
    { CallId = call.CallId
      WordCount = filteredItemCounts
      Signal = call.Signal }
/// Training calls restricted to the charged vocabulary, excluding calls
/// that contain none of the charged items.
let chargedTrain =
    trainCalls
    |> Array.Parallel.map filterCall
    // Remove "empty document vectors" (total charged count of zero)
    |> Array.filter (fun xs -> xs.WordCount |> Array.sumBy (fun xs -> xs.Count) <> 0)
/// Builds a matrix of word counts from `calls`.
///
/// Calls are ordered by ascending `Signal`, each call's counts form one
/// vector, and the resulting matrix is transposed before being returned.
/// All calls must carry word-count arrays of the same length and order
/// (as produced by `filterCall`).
// NOTE(review): assumes `matrix` treats each inner array as a row, which
// makes the result words x documents -- confirm against MathNet.
let getDocumentTermMatrix (calls: Call []) =
    calls
    |> Array.sortBy (fun xs -> xs.Signal)
    |> Array.map (fun xs ->
        xs.WordCount
        |> Array.map (fun xs -> double xs.Count))
    |> matrix
    |> fun xs -> xs.Transpose()
// Count matrix built by getDocumentTermMatrix from the filtered training calls.
let chargedDocumentTermMatrix = getDocumentTermMatrix chargedTrain
Fitting a two-topic model to the sentiment-charged counts, chargedItemCountsById
Some notation:
\[\text{Consider a collection of } n \text{ articles and a dictionary of } m \text{ words.}\]
\[d_{i} = \text{word (or phrase) counts of the } i^{th} \text{ article}\]
\[d_{i, j} = \text{ number of times word } j \text{ occurs in article } i\]
\[D = m \times n \text{ document term matrix}; D = [d_{1}, ..., d_{n}]\]
\[d_{[S], i} \sim \text{Multinomial} (s_{i}, p_{i}O_{+} + (1 - p_{i})O_{-})\]
\[p_{i} = \text{ article's sentiment score, } p_{i} \in [0,1]\]
\[s_{i} = \text{ total count of sentiment-charged words in article } i\]
\[O_{+} = \text{ positive sentiment topic}\]
\[O_{-} = \text{ negative sentiment topic}\]
\[\mathbb{E}h_{i} = \mathbb{E}\frac{d_{[S], i}}{s_{i}} = p_{i}O_{+} + (1 -p_{i})O_{-}\]
Estimate \(H\) by plugging in \(\widehat{S}\) from the screening step:
\[\widehat{h_{i}} = \frac{d_{[\widehat{S}], i}}{\widehat{s}_{i}}\]
\[\widehat{s}_{i} = \sum_{j \in \widehat{S}}{d_{j, i}}\]
Estimate \(W\) using the standardized ranks of returns. For each article \(i\) in the training sample, \(i = 1, ..., n\):
\[\widehat{p}_{i} = \frac{\text{rank of } y_{i} \text{ in } \{y_{l}\}_{l=1}^{n}}{n}\]
\[\widehat{H} = [\widehat{h_{1}}, \widehat{h_{2}},..., \widehat{h_{n}}]\]
\[\widehat{h_{i}} = \frac{d_{[\widehat{S}], i}}{\widehat{s}_{i}} \text{ } \widehat{s}_{i} = \sum_{j \in \widehat{S}}{d_{j, i}}\]
/// H-hat: the charged document-term matrix with every column divided by
/// its own sum, so each column holds a document's word shares.
///
/// Column sums are non-zero because `chargedTrain` drops calls whose
/// charged counts total zero.
let bigH =
    chargedDocumentTermMatrix
    |> Matrix.toColArrays
    |> Array.map (fun itemCounts ->
        let sumOfItemCounts = Array.sum itemCounts
        itemCounts
        |> Array.map (fun xs -> xs / sumOfItemCounts))
    // Each normalized column is fed back in as a vector, then transposed
    // to recover the original orientation.
    |> matrix
    |> fun xs -> xs.Transpose()
\[\widehat{W} = \begin{bmatrix} \widehat{p_{1}} & \widehat{p_{2}} & \cdots & \widehat{p_{n}} \\ 1 - \widehat{p_{1}} & 1 - \widehat{p_{2}} & \cdots & 1 -\widehat{p_{n}} \end{bmatrix}\]
\[\widehat{p}_{i} = \frac{\text{rank of } y_{i} \text{ in } \{y_{l}\}_{l=1}^{n}}{n}\]
/// W-hat: a two-row matrix whose first row is p_i = i / n for
/// i = 1..n and whose second row is 1 - p_i, with n the number of
/// columns of `chargedDocumentTermMatrix`.
///
/// The position i doubles as the return rank because
/// `getDocumentTermMatrix` orders documents by ascending `Signal`.
let bigW =
    let documentCount = chargedDocumentTermMatrix.ColumnCount
    let n = double documentCount
    // Only the position is used, so the ranks are generated directly
    // rather than by mapping over an array of documents.
    let pHat = Array.init documentCount (fun i -> double (i + 1) / n)
    matrix [| pHat; pHat |> Array.map (fun p -> 1. - p) |]
\[\widehat{O} = [\widehat{h_{1}}, \widehat{h_{2}},\ldots, \widehat{h_{n}}] \widehat{W}^{'} (\widehat{W}\widehat{W}^{'})^{-1}\]
/// O-hat: estimated sentiment topics, computed as H W' (W W')^-1,
/// then cleaned column by column: negative entries are set to zero and
/// the column is rescaled to sum to one.
///
/// Column 0 pairs with the first row of `bigW` (p) and column 1 with the
/// second (1 - p).
let bigO =
    let h, w, w' = bigH, bigW, bigW.Transpose()
    let ww' = w.Multiply(w')
    h.Multiply(w').Multiply(ww'.Inverse())
    |> Matrix.toColArrays
    |> Array.map (fun col ->
        let onlyPositiveVals =
            col |> Array.map (fun xs -> if xs < 0. then 0. else xs)
        let norm = Array.sum onlyPositiveVals
        onlyPositiveVals |> Array.map (fun xs -> xs / norm))
    |> matrix
    |> fun m -> m.Transpose()
Estimating \(p\) (sentiment score) for new articles using maximum likelihood estimation:
\[\widehat{p} = \arg\max_{p\in[\,0, 1]\,} \left\{\hat{s}^{-1} \sum_{j \in \widehat{S}}{d_{j}\log \left(p \widehat{O}_{+, j} + (\,1-p)\,\widehat{O}_{-, j}\right) + \lambda \log \left(p\left(1 - p \right)\right) \right\}\]
\(\hat{s}\text{ is the total count of words from } \widehat{S} \text{ in the new article, } \left(d_{j}, \widehat{O}_{+, j}, \widehat{O}_{-, j} \right) \text{ are the } j \text{th entries of the corresponding vectors, and } \lambda > 0 \text{ is the tuning parameter of the penalty term } \lambda \log \left(p\left(1 - p\right)\right)\)
For sentiment charged words, their corresponding entries in \(O\) should be different. Otherwise, these words would not represent any sentiment and should be left out of the set of sentiment charged words. Sentiment neutral words are analogous to useless predictors in a linear model.
/// Row-wise view of `bigO`: `bigOArr.[i].[0]` and `bigOArr.[i].[1]` are the
/// two topic weights of the i-th charged word, which is how the scoring
/// functions below index it.
let bigOArr =
    bigO |> Matrix.toRowArrays
/// Penalized log-likelihood of sentiment score `p` for `call`:
///
///   (1 / sHat) * sum_j d_j * log (p * O+_j + (1 - p) * O-_j)
///     + lambda * log (p * (1 - p))
///
/// where the sum runs over the charged words, d_j is the word's count in
/// the call and sHat is the call's total charged-word count.
///
/// - `call`: the call to score; it is projected onto the charged
///   vocabulary with `filterCall`, so its entries line up with `bigOArr`.
/// - `p`: candidate sentiment score in [0, 1].
/// - `lambda`: weight of the penalty term. At p = 0 or p = 1 the penalty
///   is negative infinity for any lambda > 0.
let objF (call: Call) (p: float) (lambda: float) =
    let filteredCall = filterCall call
    let chargedTotal =
        filteredCall.WordCount |> Array.sumBy (fun xs -> xs.Count)
    let logLikelihood =
        // A call with no charged words carries no likelihood information;
        // returning 0 avoids evaluating (1 / 0) * 0 = NaN.
        if chargedTotal = 0 then 0.
        else
            filteredCall.WordCount
            |> Array.mapi (fun i xs ->
                // Words absent from the call contribute nothing; skipping
                // them avoids 0 * log 0 = NaN when both topic weights are 0.
                if xs.Count = 0 then 0.
                else
                    let pos = p * bigOArr.[i].[0]
                    let neg = (1. - p) * bigOArr.[i].[1]
                    float xs.Count * log (pos + neg))
            |> Array.sum
            |> fun total -> total / float chargedTotal
    // Penalty term of the estimator; `lambda` was previously accepted but unused.
    logLikelihood + lambda * log (p * (1. - p))
/// Grid search for the sentiment score of `call`.
///
/// Evaluates `objF` with a penalty weight of 0.00001 at every candidate on
/// the grid 0, 0.01, ..., 1 and returns the best entry as
/// `((score, call.Signal), objective value)`.
let computeScore (call: Call) =
    let penaltyWeight = 0.00001
    let evaluate candidate =
        (candidate, call.Signal), objF call candidate penaltyWeight
    [|0. .. 0.01 .. 1.|]
    |> Array.map evaluate
    |> Array.maxBy (fun (_, objective) -> objective)
// Scores the first 100 calls of some array with computeScore and sorts the
// results by the second tuple component.
// NOTE(review): this definition is incomplete as shown -- the source array of
// the pipeline and the end of the lambda passed to Array.map (its result
// expression and closing parenthesis) are missing. Restore them from the
// original script before running.
let testHighSignals =
|> Array.take 100
|> Array.map (fun xs ->
let res = computeScore xs
|> Array.sortBy snd
/// Average estimated sentiment score of the 500 calls with the lowest
/// `Signal`.
///
/// `Array.take` raises if fewer than 500 calls are available.
// NOTE(review): the source array was missing; `testCalls` is assumed from
// the binding's name -- confirm against the original script.
let testLowSignals =
    testCalls
    |> Array.sortBy (fun xs -> xs.Signal)
    |> Array.take 500
    |> Array.averageBy (fun xs ->
        // computeScore returns ((score, signal), objective); keep the score.
        let (score, _), _ = computeScore xs
        score)
DiffSharp demo
#r "nuget: DiffSharp-lite, 1.0.0-preview-987646120"
open DiffSharp
open DiffSharp.Optim
/// Negated penalized log-likelihood of a call, written over a DiffSharp
/// tensor so it can be minimized by an optimizer.
///
/// - `call`: the call to score; it is projected onto the charged
///   vocabulary with `filterCall`.
/// - `x`: tensor whose first element is the candidate sentiment score.
///
/// Returns `-( (1 / sHat) * sum_j d_j * log (p * O+_j + (1 - p) * O-_j)
/// + 0.001 * log (p * (1 - p)) )`; the penalty weight is fixed at 0.001.
let computeScore' (call: Call) (x: Tensor) =
    let filteredCall = filterCall call
    // Reciprocal of the call's total charged-word count
    let sHat =
        filteredCall.WordCount
        |> Array.sumBy (fun xs -> xs.Count)
        |> fun total -> 1. / float total
    let scoreP = x.[0]
    let sumExpr =
        filteredCall.WordCount
        |> Array.mapi (fun i xs ->
            let pos = scoreP * bigOArr.[i].[0]
            let neg = (1. - scoreP) * bigOArr.[i].[1]
            let d = float xs.Count
            d * log (pos + neg))
        |> Array.sum
    let objective =
        (sHat * sumExpr) + (0.001 * log (scoreP * (1. - scoreP)))
    // Negated so that minimizing the result maximizes the objective.
    objective * -1.
// Settings passed to optim.sgd below: learning rate, momentum, iteration
// count and threshold.
let lr, momentum, iters, threshold = 1e-3, 0.5, 1000, 1e-3
// Objective for a single call, as a function of the score tensor only.
// NOTE(review): `negativeArticleTrain` is not defined anywhere in the visible
// part of this script -- confirm where it comes from.
let scoreFun = computeScore' negativeArticleTrain
// Initial guess for the sentiment score.
let scorePGuess = dsharp.tensor([0.5])
// Run SGD with Nesterov momentum on scoreFun, starting from scorePGuess.
let scoreFx, params' = optim.sgd(scoreFun, scorePGuess, lr=dsharp.tensor(lr), momentum=dsharp.tensor(momentum), nesterov=true, iters=iters, threshold=threshold)
