#!/usr/bin/ruby

##
#
#   twit.rb - Ruby script to pull geolocated tweets from the Twitter Streaming API
#   Copyright (C) 2010 Aaron Fiske
#
#   Uses Ruby/ProgressBar, copyright (C) 2001-2005 Satoru Takabayashi <satoru@namazu.org>,
#   available at http://0xcc.net/ruby-progressbar/ under the Ruby License.
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
##

require 'socket'
require 'progressbar'

# Number of records to collect. Defaults to 50; overridable via the first
# command-line argument (any non-positive or non-numeric value falls back
# to the default and prints usage).
max_records = 50

requested = ARGV[0].to_i   # nil.to_i and non-numeric strings both yield 0
if requested > 0
  max_records = requested
  puts "Scanning stream.twitter.com for #{max_records} records."
else
  puts "Usage: twit.rb [NUM_RECORDS].  Defaulting to #{max_records} records."
end

# Open a timestamped logfile for the collected records.
# strftime zero-pads every field, so filenames sort lexicographically and
# cannot collide the way the old unpadded "#{time.hour}#{time.min}#{time.sec}"
# form could (e.g. 1:12:3 and 11:2:3 both produced "1123").
time = Time.new
filename = time.strftime("tweetlog_%Y-%m-%d_%H%M%S")
logfile = File.new(filename, "w") 		# create/open a new logfile with write access
puts "Saving results to #{filename}"	# was a broken interpolation: "#(unknown)"

# Record the start time so throughput can be reported at exit.
start_time = Time.new

host = 'stream.twitter.com'    		# the Twitter streaming server
port = 80                          	# default HTTP port

# Twitter streaming API method descriptions at:  http://dev.twitter.com/pages/streaming_api_methods#locations

# path = "/1/statuses/sample.json"	# used for straight random sampling
path = "/1/statuses/filter.xml"		# used for filter-based sampling (e.g., geolocation)

# Bounding box spanning the whole globe: matches every geolocated tweet.
locations = "locations=-180,-90,180,90"

# Build the raw HTTP POST request by hand.
# NOTE: Authentication string built using code at http://en.wikipedia.org/wiki/Basic_access_authentication
# Basic random-sample stream request (GET variant), kept for reference:
# request = "GET #{path} HTTP/1.0\r\nHost: localhost\r\nAuthorization: Basic YWZpc2tlOjV5ZWFyMGxk\r\n\r\n"

# Geolocated-tweets request: headers, blank line, then the form-encoded body.
request = "POST #{path} HTTP/1.1\r\nHost: localhost\r\n" \
          "Authorization: Basic [INSERT BASE64 ENCODED AUTH HERE]\r\n" \
          "Content-Type: application/x-www-form-urlencoded\r\n" \
          "Content-Length: #{locations.length}\r\n\r\n" \
          "#{locations}\r\n\r\n"

record_count = 0	# number of records received so far
fail_count = 0		# consecutive failed connection attempts (drives backoff)


# get and process the data, reconnecting responsibly as needed

# Collect up to max_records geolocated tweets, reconnecting with exponential
# backoff whenever the server refuses the request or drops the stream.
pbar = ProgressBar.new("Progress", max_records)

while (record_count < max_records)
  socket = TCPSocket.open(host, port)		# Connect to the server
  puts "Connected to #{host} on port #{port}.  Sending request..."
  socket.print(request)				# Send request
  server_response = socket.gets			# First response line, or nil if the server hung up immediately
  # Guard against nil before include? — a dropped connection used to raise NoMethodError here.
  if server_response && server_response.include?("200 OK")	# Check server response
    puts "Server response OK. Collecting #{max_records-record_count} records."
    logfile = File.open(filename, "a")	# reopen the logfile to append the data
    fail_count = 0				# Reset the consecutive failure count
    while (line = socket.gets) && (record_count < max_records)

      if line.include? "<coordin"							# Test if we've reached the geodata portion of the record
        line = socket.gets							# If yes, get the *next* line (it has the lat/lon)
        break if line.nil?							# Stream ended mid-record; fall out and reconnect
        lat_lon_scan_arr =  line.scan(/<georss:point>(.*)<\/georss:point>/)	# Match the XML data w/regexp (i.e., strip the <georss:point> tags)

        if lat_lon_scan_arr[0]							# Make sure we got something (i.e., array is not nil)

          lat_str, lon_str = lat_lon_scan_arr[0][0].split			# If we did, pull out the lat/lon as strings
          lat = lat_str.to_f						# Convert to floats
          lon = lon_str.to_f
          # Drop (0, 0) points — used by the API as a "no location" sentinel.
          if ! ((lon == 0.0) && (lat == 0.0))
            logfile.puts "#{record_count} #{Time.new.to_i} #{lat} #{lon}"	# Write the record count, timestamp, and lat/lon to the logfile
          end									# TO DO: pull actual timestamp from tweet (time of receipt is kludge-y)
          logfile.flush							# flush the logfile so it can be monitored live (e.g., tail -f)
          $stdout.flush							# flush the output buffer
          record_count += 1						# increment record counter
          pbar.inc

        end
      end
    end
    socket.close					# we're either out of responses or we're done collecting, so close the connection
    logfile.close					# close the logfile
    if (record_count < max_records)
      puts "Connection closed before all records could be downloaded (current position: #{record_count} of #{max_records}).  Reconnecting..."
    end

  else							# server didn't respond OK (or hung up)
    fail_count += 1
    socket.close					# don't leak the socket while we back off
    puts "Server response not OK (#{fail_count} attempt(s)).  Sleeping for #{10**fail_count} seconds."		# wait and try again
    sleep(10**fail_count)				# exponential backoff: 10, 100, 1000, ... seconds
  end
end

pbar.finish
end_time = Time.new #record the end time
total_time = (end_time - start_time).to_i
# Guard against a sub-second run: total_time truncates to 0, and
# record_count.to_f / 0 would print the misleading value "Infinity".
records_per_second = total_time.zero? ? record_count.to_f : record_count.to_f / total_time



puts "Recorded #{record_count} records in #{total_time} seconds."
puts "  - #{records_per_second} records/second"
#puts "  - #{bytes_per_second} bytes/second recorded"


# Connection logic in Pseudocode:
# wait_time = 10 sec
# fail_count = 0
# while (record_count_not_yet_reached)
#	connect_to_server
#	read_lines
# 	if (no_response)  # need to check HTTP response here
#		fail_count += 1
#		wait for wait_time ** fail_count  # exponential backoff
#	else
#		fail_count = 0
#		process_data
#		record_count += 1
#	end
# end


