This is a project I am just beginning. The concept is to align and link the headline trend data to other FRED or BLS or MARKET longitudinal data.
This might be on hold for a few weeks, but it’s a start.
CURRENT MYSQL DB DEFINITION:
-- MySQL dump 10.13 Distrib 8.0.39, for Win64 (x86_64)
--
-- Host: localhost Database: news
-- ------------------------------------------------------
-- Server version 8.0.39
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!50503 SET NAMES utf8mb4 */;
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
/*!40103 SET TIME_ZONE='+00:00' */;
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
--
-- Table structure for table `headline_token`
--
DROP TABLE IF EXISTS `headline_token`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `headline_token` (
`headline_token_id` bigint NOT NULL AUTO_INCREMENT,
`rss_id` bigint NOT NULL,
`year` int NOT NULL,
`month` int NOT NULL,
`day` int NOT NULL,
`word` varchar(200) DEFAULT NULL,
`freq` int DEFAULT NULL,
`created_on` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`headline_token_id`),
KEY `idx_headline_token` (`rss_id`,`year`,`month`,`day`,`word`)
) ENGINE=InnoDB AUTO_INCREMENT=64162 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `log`
--
DROP TABLE IF EXISTS `log`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `log` (
`log_id` bigint NOT NULL AUTO_INCREMENT,
`step` varchar(100) NOT NULL,
`log_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`log_id`)
) ENGINE=InnoDB AUTO_INCREMENT=1248 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `rss`
--
DROP TABLE IF EXISTS `rss`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `rss` (
`ID` bigint NOT NULL AUTO_INCREMENT,
`SOURCE` varchar(100) CHARACTER SET utf8mb3 COLLATE utf8mb3_unicode_ci NOT NULL,
`LINK` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_unicode_ci NOT NULL,
`TITLE` varchar(400) CHARACTER SET utf8mb3 COLLATE utf8mb3_unicode_ci NOT NULL,
`PUBLISHED` datetime NOT NULL,
PRIMARY KEY (`ID`),
UNIQUE KEY `unique_link` (`LINK`)
) ENGINE=InnoDB AUTO_INCREMENT=5491 DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_unicode_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
-- Dump completed on 2024-12-09 9:49:20
GRABBING HEADLINES FROM RSS FEEDS:
from __future__ import print_function
import os
import feedparser
import os.path, time
import json
import math
import time
import urllib.parse as pr
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup as BS
from requests import get
from os.path import exists
from socket import socket, AF_INET, SOCK_STREAM
from decimal import Decimal
from datetime import datetime, date, timedelta
from anyascii import anyascii
import mysql.connector
from unidecode import unidecode
usern = "root"
passw = "password"
dbn = "NEWS"
servern = "localhost"
portn = 3306
#track number of records added
record_load = 0
def logrun(step):
cnx = mysql.connector.connect(user=usern, database=dbn, password=passw, host=servern, port=portn)
cur = cnx.cursor()
qry = """
INSERT INTO log
(step)
VALUES
('<<ARG>>')
""".replace("<<ARG>>", step)
cur.execute(qry)
cnx.commit()
cnx.close()
def AlreadySaved(link):
# Connect with the MySQL Server
cnx = mysql.connector.connect(user=usern, database=dbn, password=passw, host=servern, port=portn)
qry = "select ID from RSS where LINK = '" + link + "'"
cur = cnx.cursor(buffered=True)
cur.execute(qry)
cur.fetchall()
rc = cur.rowcount
cnx.close()
if rc > 0:
return True
else:
return False
def SaveRSS(source, title, link, published):
global record_load
record_load += 1
tit = title.replace("'", "''")
cnx = mysql.connector.connect(user=usern, database=dbn, password=passw, host=servern, port=portn)
cur = cnx.cursor()
qry = """
INSERT INTO RSS
(SOURCE,
LINK,
TITLE,
PUBLISHED)
VALUES
(%s,%s,%s,%s)
"""
val = (source, link, tit, published)
cur.execute(qry, val)
cnx.commit()
cnx.close()
def GrabRSS(RssURL, SourceName):
hdrs = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
NewsFeed = feedparser.parse(RssURL)
for na in NewsFeed.entries:
try:
print(na.title)
print(na.link)
print(na.published)
print(na.published_parsed)
except:
logrun("ERROR: GrabRSS from " + SourceName)
continue
if AlreadySaved(na.link.strip().upper()):
continue
print("*************************")
tyr = na.published_parsed[0]
tmn = na.published_parsed[1]
tdy = na.published_parsed[2]
thr = na.published_parsed[3]
tmi = na.published_parsed[4]
tsc = na.published_parsed[5]
ptms = "%s-%s-%s %s:%s:%s" % (tyr, tmn, tdy, thr, tmi, tsc)
SaveRSS(SourceName, unidecode(na.title), na.link.strip().upper(), ptms)
def debugHere():
input("Press enter to continue ...")
def clearConsole():
command = 'clear'
if os.name in ('nt', 'dos'): # If Machine is running on Windows, use cls
command = 'cls'
os.system(command)
def CycleFeeds():
logrun("start: CycleFeeds")
infowars = "https://www.infowars.com/rss.xml"
zh = "https://feeds.feedburner.com/zerohedge/feed"
yahoo = "https://news.yahoo.com/rss/"
bbcworld = "https://feeds.bbci.co.uk/news/world/rss.xml"
bbc = "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"
nyt = "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"
cnbc = "https://www.cnbc.com/id/100727362/device/rss/rss.html"
fox = "https://moxie.foxnews.com/google-publisher/latest.xml"
aljazeera = "https://www.aljazeera.com/xml/rss/all.xml"
cbs = "https://www.cbsnews.com/latest/rss/world"
f24 = "https://www.france24.com/en/rss"
gnews = "https://globalnews.ca/world/feed/"
wtimes = "https://www.washingtontimes.com/rss/headlines/news/world"
sydher = "https://www.smh.com.au/rss/world.xml"
skynews = "https://feeds.skynews.com/feeds/rss/world.xml"
latimes = "https://www.latimes.com/world/rss2.0.xml"
tindia = "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms"
rt = "https://www.rt.com/rss/news/"
sun = "https://www.thesun.co.uk/news/worldnews/feed/"
mirror = "https://www.mirror.co.uk/news/world-news/?service=rss"
vox = "https://www.vox.com/rss/world-politics/index.xml"
wotr = "https://warontherocks.com/feed/"
hot = "https://www.headlinesoftoday.com/feed"
wnera = "https://worldnewsera.com/feed/"
gpress = "https://globalpressjournal.com/feed/"
indep = "https://www.independent.co.uk/news/world/rss"
spiegel = "https://www.spiegel.de/international/index.rss"
guard = "https://www.theguardian.com/world/rss"
GrabRSS(guard, "GUARDIAN")
GrabRSS(spiegel, "DER SPIEGEL")
GrabRSS(indep, "INDEPENDENT")
GrabRSS(gpress, "Global Press Journal")
GrabRSS(wnera, "World News Era")
GrabRSS(hot, "Headlines of Today")
GrabRSS(wotr, "WAR ON THE ROCKS")
GrabRSS(vox, "VOX")
GrabRSS(mirror, "MIRROR")
GrabRSS(sun, "SUN")
GrabRSS(rt, "RT")
GrabRSS(tindia, "Times of India")
GrabRSS(latimes, "LA TIMES")
GrabRSS(skynews, "SKY NEWS")
GrabRSS(sydher, "Sydney Herald")
GrabRSS(wtimes, "WA TIMES")
GrabRSS(gnews, "Global News")
GrabRSS(f24, "FRANCE24")
GrabRSS(cbs, "CBS")
GrabRSS(aljazeera, "ALJAZEERA")
GrabRSS(fox, "FOX")
GrabRSS(cnbc, "CNBC")
GrabRSS(bbcworld, "BBC WORLD")
GrabRSS(infowars, "INFOWARS")
GrabRSS(zh, "ZEROHEDGE")
GrabRSS(yahoo, "YAHOO")
GrabRSS(bbc, "BBC")
GrabRSS(nyt, "NYT")
logrun("stop: CycleFeeds, records: " + str(record_load))
CycleFeeds()
TOKENIZE HEADLINES:
#parse headlines
import re
import mysql.connector
from anyascii import anyascii
#f = open("demofile2.txt", "a")
#f.write("Now the file has more content!")
#f.close()
#open and read the file after the appending:
#f = open("demofile2.txt", "r")
#print(f.read())
usern = "root"
passw = "password"
dbn = "NEWS"
servern = "localhost"
portn = 3306
def GetSql(qry):
# Connect with the MySQL Server
cnx = mysql.connector.connect(user=usern, database=dbn, password=passw, host=servern, port=portn)
cur = cnx.cursor(buffered=True)
cur.execute(qry)
retRes = cur.fetchall()
cnx.close()
return retRes
def SaveToks(rss_id, year, month, day, word, freq):
cnx = mysql.connector.connect(user=usern, database=dbn, password=passw, host=servern, port=portn)
cur = cnx.cursor()
qry = """
INSERT INTO headline_token
(rss_id,
year,
month,
day,
word,
freq)
VALUES
(%s,%s,%s,%s,%s,%s)
"""
val = (rss_id, year, month, day, word, freq)
cur.execute(qry, val)
cnx.commit()
cnx.close()
def allHeadlines():
headln = """
select
r.id,
r.title,
r.published
from rss r
left outer join headline_token ht
on r.id = ht.rss_id
where ht.rss_id is null;
"""
rset = GetSql(headln)
f = open("news_token.txt", "w")
f.write("ID")
f.write('\t')
f.write("YEAR")
f.write('\t')
f.write("MONTH")
f.write('\t')
f.write("DAY")
f.write('\t')
f.write("WORD")
f.write('\t')
f.write("FREQUENCY")
f.write('\n')
cntr = 0
for rec in rset:
cntr += 1
print(str(cntr))
id = rec[0]
tit = rec[1]
pub = rec[2]
parts = str(pub).split()
parts2 = parts[0].split('-')
yr = int(parts2[0])
mn = int(parts2[1])
dy = int(parts2[2])
tit = re.sub(r"(?<=\d),(?=\d)", "", tit)
tit = tit.replace("''","'")
tit = tit.replace("'","")
tit = tit.replace(","," ")
tit = tit.replace(":", " ")
#&+
tit = tit.replace("&", " ")
tit = tit.replace("+", " ")
tit = tit.replace("\"", " ")
tit = tit.replace("(", " ")
tit = tit.replace(")", " ")
tit = tit.replace("|", " ")
tit = tit.replace("?", " ")
tit = tit.replace(";", " ")
tit = tit.replace("...", " ")
tit = tit.upper()
tits = tit.split()
dic = {}
for t in tits:
t2 = t.strip()
if t2 in dic:
dic[t2] += 1
else:
dic[t2] = 1
for k in dic.keys():
f.write(str(id))
f.write('\t')
f.write(str(yr))
f.write('\t')
f.write(str(mn))
f.write('\t')
f.write(str(dy))
f.write('\t')
#f.write(str(k.encode("UTF-8")))
f.write(anyascii(k))
f.write('\t')
f.write(str(dic[k]))
f.write('\n')
SaveToks(id, yr, mn, dy, anyascii(k), dic[k])
f.close()
allHeadlines()
Checkout Results:
select * from
(
select
year,
month,
day,
word,
sum(freq) as frq
from headline_token
where not word in
('TO','IN','OF','THE','FOR','AND','ON',
'AFTER','AS','A','IS','WITH','AT','BY',
'HOW','BE','ABOUT','-','HIS','HER','MORE',
'IT','WILL','HE','HAS', 'SEE', 'THEY', 'SHE',
'LAST','GOES','NEXT', 'SAYS', 'FROM', 'WHAT',
'MY', 'THEY', 'THEM', 'I', 'MOST', 'COULD',
'WHERE', 'BUT', 'AN', 'WE', 'HAVE')
group by
year,
month,
day,
word
order by year asc, month asc, day asc, sum(freq) desc
) t
where t.frq > 8;