Amazon : Determine which books are going to sell or not using k-means clustering and Amazon AWS

Download amazon.zip

Synopsis:

amazon_gen.rb
amazon_kmeans.rb


amazon_gen.rb

Synopsis
#!/usr/bin/ruby -w
require 'amazon/aws/search'
include Amazon::AWS
include Amazon::AWS::Search

#--- GLOBALS
#Read this url:  http://raa.ruby-lang.org/project/ruby-aws/
#Assumes that the ~/.amazonrc file contains
#  [global]
#  locale = 'us' 
#  cache = false 
#  key_id = 'your key id'
#  secret_key_id='your secret key'
#  [us]

# Amazon seller whose listings are scanned.
# CHANGE THIS TO YOUR SELLER ID!!!
seller_id = 'A2DI81WTOSK70H'

# Output file handle: the generated saveit(...) records are written here.
$my_file = File.open('books.rb', 'w')

#----
# Look up offer data for one ASIN (ISBN) and append it to the current record.
#
# Writes three comma-separated fields to $my_file -- number of used offers,
# lowest used price and sales rank -- and returns +msg+ with a human-readable
# summary of the same data appended.
#
# msg    - String of log text accumulated so far for this item.
# id     - the ASIN to look up.
# amount - the listing price; not used here, kept so existing callers still work.
#
# Always returns a String. When the lookup raises, placeholder fields are
# written and a warning line is appended to +msg+ instead of the offer data.
def GetOfferData(msg, id, amount)
  op = ItemLookup.new('ASIN', { 'ItemId' => id })
  op_rg = ResponseGroup.new('OfferFull', 'SalesRank')
  req = Request.new
  req.locale = 'us'
  begin
    resp = req.search(op, op_rg)
  rescue StandardError
    # Same widths as the "%3d, %6.2f, %9d" fields written on success, so the
    # generated saveit([...]) line keeps its shape.
    $my_file.print "  0,   0.00,         0"
    # Return a plain String: the caller assigns the result back to its msg.
    return msg + "WARNING: asin lookup failed: #{id}\n"
  end

  item = resp.item_lookup_response[0].items[0].item[0]
  summary = item.offer_summary
  msg = msg + "Used : " + summary.total_used
  $my_file.printf "%3d, ", summary.total_used.to_i
  if summary.total_used == '0'
    # No used offers, so there is no lowest_used_price to read.
    msg = msg + " Price: 0 \n"
    $my_file.printf "%6.2f, ", 0.0
  else
    msg = msg + " Price: " + summary.lowest_used_price.formatted_price + "\n"
    # amount is divided by 100.0 to get the decimal price written to the file.
    $my_file.printf "%6.2f, ", summary.lowest_used_price.amount.to_i / 100.0
  end

  rank = item.sales_rank.to_i
  msg = msg + "Rank : " + rank.to_s + "\n"
  # A rank of 0 (missing or unparsable) is replaced by a very large rank in
  # the data file; the log message above still shows the raw value.
  rank = 100000000 if rank == 0
  $my_file.printf "%9d", rank

  msg
end

#---------------- MAIN -------------------------------

##---- Walk every listing page for the seller and write one saveit(...) line
##     per listed item to $my_file.
page = 1
# Placeholder upper bound; lowered to the real page count reported by the
# first response.
max_page = 99999
while page <= max_page
  op = SellerListingSearch.new(seller_id, { 'ListingPage' => page.to_s })
  op_rg = ResponseGroup.new('SellerListing')
  req = Request.new
  req.locale = 'us'
  resp = req.search(op, op_rg)

  listings = resp.seller_listing_search_response[0].seller_listings[0]
  pages = listings.total_pages.to_i
  print "page: ", page, " of ", pages, "\n"
  max_page = pages if pages < max_page

  listings.seller_listing.each do |item|
    # msg collects a human-readable description of the item; it is handy for
    # debugging but is not printed anywhere.
    msg = ''
    msg = msg + "ASIN : " + item.asin + "\n"
    msg = msg + "list : " + item.listing_id + "\n"
    title = item.title || 'unknown'

    # Each record is emitted as a Ruby call:
    #   saveit([price, used_count, used_price, rank], 'N', "title")
    $my_file.print "saveit(["
    msg = msg + "Title: " + title + "\n"
    msg = msg + "Price: " + item.price.formatted_price + "\n"
    $my_file.printf "%6.2f, ", item.price.amount.to_i / 100.0
    msg = msg + "Cond : " + item.condition + " " + item.sub_condition + "\n"
    msg = GetOfferData(msg, item.asin, item.price.amount)

    # String#inspect yields a correctly escaped double-quoted literal, so a
    # title containing quotes, backslashes or "#{" cannot break (or inject
    # code into) the generated Ruby file.
    $my_file.print "], 'N', ", title.to_s.inspect, ")\n"
  end
  page += 1
end

# Flush and release the output file once every page has been written.
$my_file.close

amazon_kmeans.rb

Synopsis
#! ruby
require "rubygems"
require "ai4r"

#--- GLOBALS
#Set the max number of clusters; this value is passed to the clusterer's build().
#Why a default of 10? No particular reason: it is arbitrary, and it
#worked well for my site.
$num_clusters = 10

#TO USE:
#  1) Do these steps ONCE:
#     a) run ruby amazon_gen.rb to generate "books.rb"
#     b) rename it to "mybooks.rb"
#  2) run ruby amazon_kmeans.rb > kmeans.out to generate a list of books and clusters
#  3) as you sell books, edit mybooks.rb, changing "N" to "Y" for those books you sold
#  4) re-run ruby amazon_kmeans.rb > kmeans.out
#  5) scan file kmeans.out to find out which cluster numbers are selling and which are not,
#     e.g. in mine cluster 1 and cluster 9 sold heavily, the others only one or two books, or none at all.


#---------------------------
#-- Storage for the book records loaded from the data file.
# $rawdata : list of data rows handed to the clusterer
# $map     : data row => "sold - title" label used when printing a cluster
$map = {}
$rawdata = []

# Record one book.
#   d     - array of attribute values for the book; the sold flag is appended to it
#   sold  - sold flag string (e.g. 'N' or 'Y')
#   title - book title
# Returns the label string stored in $map.
def saveit(d, sold, title)
  d << sold
  $rawdata << d
  label = sold + " - "
  $map[d] = label + title
end

#this require must be AFTER function definition of saveit()
#(mybooks.rb calls saveit(...) for each book as it is loaded).
#The explicit "./" is required because Ruby 1.9.2 and later no longer search
#the current directory for a bare require "mybooks.rb".
require "./mybooks.rb"

#---------------- main -------------------------------------------
data_set = Ai4r::Data::DataSet.new(:data_items => $rawdata)
#generate the clusters for your books.
#NOTE(review): the page describes k-means, but the clusterer built here is
#Ai4r's Diana -- confirm which one is intended.
clusterer = Ai4r::Clusterers::Diana.new.build(data_set, $num_clusters)

#dump the cluster information: one line per book, then a per-cluster summary.
clusterer.clusters.each_with_index do |cluster, i|
  soldcount = 0
  cluster.data_items.each do |point|
    printf("%3d : ", i + 1)
    print point.inspect
    printf(" %-.70s\n", $map[point])
    # saveit() appends the sold flag as the last element of each row, so read
    # it from the end instead of relying on a fixed column index.
    soldcount += 1 if point.last =~ /Y/i
  end
  print "---- Number of titles in cluster ", (i + 1), ": ", cluster.data_items.size, "  #sold=", soldcount, "\n"
end






Contact me about content on this page using john_web-at-arrizza-dot-com
For Web Master or site problems contact: webadmin-at-arrizza-dot-com
Copyright John Arrizza (c) 2001-2010