Filmnächte-Scraping
Bald sind wieder, ganz hip, Filmnächte am Elbufer. Leider ist auf http://filmnaechte-am-elbufer.de/ keine sofort verwertbare Information zu finden.
require 'htree'

# Reads a locally saved copy of the "Filmnaechte am Elbufer" programme page
# (programm.html), collects one record per programme title and prints the
# records as a whitespace-aligned text table on stdout.

#
# Fetch
#
# Uncomment (or run by hand) to download the page that is parsed below.
#system("wget -O programm.html 'http://filmnaechte-am-elbufer.de/fn.php?idx=20'")

#
# Parse
#
# Block form of File.open so the handle is closed once the document is built.
doc = File.open('programm.html') { |file| HTree(file).to_rexml }

events = []
spans = {}
doc.each_element('/html/body//table[@style=\'width: 488px\']/tr/td//span') do |span|
  css_class = span.attributes['class']

  # NOTE(review): as written this replaces a plain space with a plain space.
  # Presumably the escaped character was a non-breaking space (from &nbsp;)
  # that got lost in copy/paste -- confirm against the original script.
  text = span.text.to_s.gsub(/\ /, ' ')
  spans[css_class] = text unless text.empty?

  # Only a title span completes an event; all other spans just accumulate.
  next unless css_class == 'progTitle'

  events << spans
  # Day, time and week are carried over to the next event, because the page
  # does not necessarily repeat them for every title.
  spans = {
    'progDay'  => spans['progDay'],
    'progTime' => spans['progTime'],
    'progWeek' => spans['progWeek']
  }
end

#
# Tabular output
#
# Widest cell per column. Carried-over cells may be nil (e.g. no week seen
# yet), hence the to_s before measuring.
column_sizes = Hash.new(0)
events.each do |event|
  event.each do |column, cell|
    width = cell.to_s.size
    column_sizes[column] = width if width > column_sizes[column]
  end
end

events.each do |event|
  special_day = event['progTitleSpecialDay']
  # join (not Array#to_s, which yields the inspect form on Ruby >= 1.9)
  # concatenates the padded cells into one row.
  row = %w(progWeek progDay progTime progTitle).map { |column|
    event[column].to_s.ljust(column_sizes[column] + 2)
  }.join.strip
  puts row + (special_day ? " (#{special_day.strip})" : "")
end