1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
|
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from pattern.web import Twitter, hashtags
from pattern.db import Datasheet, pprint, pd
# This example retrieves tweets containing given keywords from Twitter.
# Tweets are stored in a Datasheet: a table of rows and columns that can be
# exported as a CSV-file. The first column holds a unique id for each tweet.
# We only want to add the latest tweets, i.e., those we haven't seen yet,
# so the ids in the first column are kept in a set for quick membership checks.
# The pd() function returns the parent directory of this script + any given path.
try:
    table = Datasheet.load(pd("cool.csv"))
    index = set(table.columns[0])
except Exception:
    # Best effort: if cool.csv cannot be loaded or its first column cannot be
    # read (presumably the case on the very first run), start with an empty
    # table. Unlike a bare "except:", this does not swallow KeyboardInterrupt
    # or SystemExit.
    table = Datasheet()
    index = set()
engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None  # Id of the last tweet seen; passed as start= to the next search.
for i in range(2):
    # print() is always called with exactly one argument, so the output is
    # the same whether print is a statement (Python 2) or a function (Python 3).
    print(i)
    for tweet in engine.search("is cooler than", start=prev, count=25, cached=False):
        print("")
        print(tweet.text)
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print("")
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in next iteration.
        prev = tweet.id
# Create a .csv in pattern/examples/01-web/
table.save(pd("cool.csv"))

# A single pre-formatted string keeps the output identical whether print is
# a statement (Python 2) or a function (Python 3).
print("Total results: %d" % len(table))
print("")

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...
pprint(table, truncate=100)
# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
|