DVB-Scraping

Aus C3D2
Version vom 1. Dezember 2008, 18:57 Uhr von Wod (Diskussion | Beiträge) (awk (gawk!))

(Unterschied) ← Nächstältere Version | Aktuelle Version (Unterschied) | Nächstjüngere Version → (Unterschied)
Wechseln zu: Navigation, Suche


Ruby

Telnet-Haltestellenmonitor

b0rk3d

Teile des folgenden Inhalts sind nicht korrekt. Begründung: Die Struktur der WAP-Seiten war nicht permanent und wurde in der Zwischenzeit geändert. Jemand muss den Code an die neuen URLs anpassen.

<source lang="ruby">#!/usr/bin/env ruby
require 'net/http'
require 'socket'
require 'rexml/document'

class MultipleStations < RuntimeError

 # Keeps the list of candidate station names; #to_s joins them into the
 # error message.
 def initialize(stations)
   @stations = stations
 end
 # Error text listing every candidate station, separated by ", ".
 def to_s
   "Mehrere mögliche Haltestellen: #{@stations.join(', ')}"
 end

end

class StationResult

 # Extracts the station name, the timestamp and the departures from a
 # result card.
 #
 # The first two 'p/strong' elements supply @name and @time; any further
 # ones are ignored. Departures are pulled out of the card's serialized
 # markup with a regular expression and stored in @trams as
 # [time, line, direction] string triples.
 #
 # card: object responding to each_element(xpath) and to_s (an REXML
 #       element when called from ask_haltestellenmonitor).
 def initialize(card)
   headings = []
   card.each_element('p/strong') { |strong| headings << strong.text }
   @name, @time = headings

   @trams = card.to_s.scan(/br\/>(\d+:\d+\*?) (.+?)<br\/>-> (.+?)</)
 end
 # Renders the departures as a plain-text table: a "name, time:" heading,
 # a column header, a rule, then one row per departure.
 #
 # Each column is as wide as its longest cell, but never narrower than the
 # minimum widths 4 / 5 / 4.
 def to_s
   widths = [4, 5, 4]
   @trams.each do |row|
     row.each_with_index do |cell, col|
       widths[col] = [widths[col], cell.size].max
     end
   end

   # Left-justifies three cells to the column widths and joins them.
   pad = lambda do |cells|
     cells.each_with_index.map { |cell, col| cell.ljust(widths[col]) }.join(' | ')
   end

   heading = pad.call(['Zeit', 'Linie', 'Ziel'])
   rule = widths.map { |width| '-' * width }.join('-+-')
   rows = @trams.map { |time, tram, direction| pad.call([time, tram, direction]) }

   "\n\n#{@name}, #{@time}:\n\n#{heading}\n#{rule}\n#{rows.join("\n")}"
 end

end

class ClientHandler

 # Takes over an accepted client socket and serves it on its own thread.
 #
 # All work, including the "connected" log line, happens inside the
 # thread, so a peer that vanishes right after accept cannot raise into
 # the caller. Failures of class StandardError are reported to the client
 # as "Fehler: ..."; signals and exit requests are deliberately not
 # rescued. The socket is closed whether or not the request succeeded.
 #
 # socket: the connected client socket.
 # Returns the Thread serving the client.
 def initialize(socket)
   @socket = socket
   Thread.new {
     begin
       puts "#{address} connected"
       handle
     rescue StandardError => e
       begin
         @socket.puts("Fehler: #{e}")
       rescue StandardError
         # The peer is gone, so there is nobody left to tell; closing the
         # socket below is all that remains to do.
       end
     ensure
       @socket.close
     end
   }
 end
 # Peer address as "ip:port"; IPv6 addresses are wrapped in brackets
 # ("[ip]:port").
 def address
   family, port, _hostname, ip = @socket.peeraddr
   host = family == 'AF_INET6' ? "[#{ip}]" : "#{ip}"
   "#{host}:#{port}"
 end
 # Asks the DVB WAP departure monitor about +station+ for the current time.
 #
 # station: station name as typed by the client. It is form-encoded before
 #          being put into the query string, so characters such as "&",
 #          "=" or "#" cannot alter the request.
 #
 # Returns the formatted departure table (StationResult#to_s).
 # Raises MultipleStations when the server answers with a selection list,
 # and RuntimeError for a non-success HTTP response, a document without a
 # card, or a card with an unknown id.
 def ask_haltestellenmonitor(station)
   # One timestamp for both fields, so time and date cannot disagree
   # around midnight.
   now = Time.now
   query = URI.encode_www_form(
     :station => station,
     :action => :check,
     :time => now.strftime('%H:%M'),
     :date => now.strftime('%d.%m.%Y')
   )
   res = Net::HTTP.start('wap.dvbag.de') { |http|
     http.get('/wapVVO/wap-rbl.php?' + query)
   }
   raise res.class.to_s unless res.kind_of? Net::HTTPSuccess

   wml = REXML::Document.new(res.body).root
   card = nil
   # When several cards match, the last one wins.
   wml.each_element('/wml/card') { |c| card = c } if wml
   raise "No card found in result document" unless card

   case card.attributes['id']
   when 'liste'
     stations = []
     card.each_element('p/select/option') { |option| stations << option.text }
     raise MultipleStations.new(stations)
   when 'result'
     StationResult.new(card).to_s
   else
     raise "Unexpected card/@id: #{card.attributes['id']}"
   end
 end
 # Runs one dialogue: greets the client, reads a station name, logs the
 # request to stdout and sends back the departure table. Does nothing more
 # when the client closes the connection before sending a line.
 def handle
   @socket.print "Hallo #{address}\n\nHaltestelle: "
   @socket.flush

   line = @socket.gets
   return unless line

   line.strip!
   puts "#{address} asks for #{line.inspect}"
   @socket.puts "Anfrage nach #{line}..."
   @socket.puts ask_haltestellenmonitor(line)
 end

end serv = TCPServer.new('0.0.0.0', 65023) while client = serv.accept

 ClientHandler.new(client)

end </source>

Und dann:

telnet localhost 65023

NCurses Monitor

Der VVO stellt jetzt ja für seine Widgets eine JSON(?)-Variante der Daten zur Verfügung.

<source lang="ruby">

require 'net/http'
require 'uri'
require 'ncurses'

# Client for the VVO departure-monitor widget feed.
class DvbAbfahrt
  # Endpoint queried by #fetch; the query string is built per request.
  BASE_URI = 'http://widgets.vvo-online.de/abfahrtsmonitor/Abfahrten.do'.freeze

  # Fetches the upcoming departures of one stop.
  #
  # ort: town name.
  # hst: stop name.
  # vz:  value of the feed's "vz" query parameter (default 0).
  #
  # Returns an array of [line, destination, minutes] string triples, in
  # feed order.
  def fetch(ort, hst, vz = 0)
    # TODO: exception handling (timeouts, non-success responses)
    uri = URI.parse(BASE_URI)
    # Form-encoding instead of pasting into a template: an argument that
    # contains "HST" or "VZ" can no longer be substituted a second time.
    uri.query = URI.encode_www_form(:ort => ort, :hst => hst, :vz => vz)
    parse(Net::HTTP.get(uri))
  end

  # Replaces the HTML entities used by the feed and returns the result as
  # a new UTF-8 string; +s+ itself is left untouched.
  #
  # NOTE(review): the original replacement table was destroyed when this
  # page was rendered. Dropping "&quot;" and decoding numeric "&#NNN;"
  # references mirrors the gawk/perl client on this page and is what the
  # scan in #parse needs - confirm against the live feed.
  def umlauts_hack(s)
    text = s.dup.force_encoding(Encoding::UTF_8)
    # Bytes that are not valid UTF-8 are taken to be ISO-8859-1 so the
    # regexp below cannot fail on them - presumably never needed.
    text = text.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8) unless text.valid_encoding?
    text.gsub('&quot;', '').gsub(/&#(\d+);/) { [Regexp.last_match(1).to_i].pack('U') }
  end

  private

  # Extracts the [line, destination, minutes] triples from a response body.
  def parse(body)
    umlauts_hack(body).scan(/(\d+),([^,]{1,}),(\d+)/)
  end
end

# Full-screen ncurses departure board for a single stop. It polls
# DvbAbfahrt every @timeout seconds and redraws the table.
class Monitor
  # Starts ncurses (yellow on black, cursor hidden), shows "Loading..."
  # and stores the query.
  #
  # ort, hst, vz: passed unchanged to DvbAbfahrt#fetch on every refresh.
  def initialize(ort, hst, vz = 0)
    Ncurses.initscr
    Ncurses.start_color
    Ncurses.init_pair(1, Ncurses::COLOR_YELLOW, Ncurses::COLOR_BLACK)
    Ncurses.attron(Ncurses.COLOR_PAIR(1))
    Ncurses.curs_set(0)
    Ncurses.move(0, 0)
    Ncurses.printw 'Loading...'
    Ncurses.refresh

    @sep = '|'
    @timeout = 30 # seconds between refreshes

    # Column widths: line number, destination name, minutes.
    @wlno = 3
    @wlname = 20
    @weta = 3

    @ort = ort
    @hst = hst
    @vz = vz
    @dvb = DvbAbfahrt.new
    @lines = 0 # number of rows drawn by the previous refresh
  end

  # Draws the stop name, centered over the table width, in reverse bold on
  # the first screen line.
  def print_title
    Ncurses.attron(Ncurses::A_REVERSE | Ncurses::A_BOLD)
    # +2 for the two separators. The text is passed as an argument to a
    # "%s" format - NOTE(review): assumes the binding treats the first
    # string as a printf-style format; confirm.
    Ncurses.mvprintw(0, 0, '%s', @hst.center(@wlno + @wlname + @weta + 2))
    Ncurses.attroff(Ncurses::A_REVERSE | Ncurses::A_BOLD)
  end

  # Refreshes the board forever; only an exception ends the loop.
  def mainloop
    print_title
    loop do
      info = @dvb.fetch(@ort, @hst, @vz)

      # Clear only when the table got shorter. Rows have a fixed width, so
      # a table of the same or greater length overwrites the old one.
      if @lines > info.size
        Ncurses.clear
        print_title
      end
      @lines = info.size

      info.each_with_index do |entry, i|
        Ncurses.mvprintw(i + 1, 0, '%s', format_row(entry))
      end
      Ncurses.refresh
      Ncurses.move(0, 0)
      sleep @timeout
    end
  end

  private

  # Formats one [line, destination, minutes] triple as a fixed-width row.
  # The destination is cut to exactly @wlname characters so a long name
  # cannot push the last column out of alignment.
  def format_row(entry)
    line_no, destination, eta = entry
    [
      line_no.rjust(@wlno),
      destination[0, @wlname].center(@wlname),
      eta.rjust(@weta)
    ].join(@sep)
  end
end


if __FILE__ == $0 begin if !ARGV.empty? m = Monitor.new("Dresden", ARGV[0]) m.mainloop else puts "USAGE #{$0} <HALTESTELLENNAME>" end ensure Ncurses::attroff(Ncurses::COLOR_PAIR(1)); Ncurses::endwin end end </source>

awk (gawk!)

1.) DVB Stationsmonitor, Usage: ./pnv [station] (z.B. ./pnv Bischofsweg) <source lang="bash">

#!/bin/bash

# Station to query; defaults to "Mockritzer Strasse" when no argument is given.
# This must be its own statement: written as a prefix of the awk command it
# would only be placed in awk's environment, and the ${station...} expansion
# below would see an empty value.
station=${@:-Mockritzer Strasse}

# gawk talks HTTP itself through its /inet/tcp special file. Records are
# split at "],[" (and at the blank line after the HTTP headers), fields at
# the quote entities around each value.
# NOTE(review): the field separator is written with literal &quot; entities;
# the rendered wiki page showed bare double quotes, which cannot split the
# records selected by the /quot/ test below - confirm against the live feed.
# ${station// /%20} encodes every space, not just the first one.
awk -vRS='\\],\\[|\r\n\r\n' -vFS='&quot;(,&quot;)*' -vstation="${station// /%20}" 'BEGIN {
    s = "/inet/tcp/0/widgets.vvo-online.de/80";
    print "GET /abfahrtsmonitor/Abfahrten.do?ort=Dresden&hst=" station "&vz=5 HTTP/1.1\r\nHost: widgets.vvo-online.de\r\nAccept: */*\r\nConnection: close\r\n\r\n" |& s;
    print "Nr. Min. Richtung";
    while (s |& getline) {
        # Only departure records contain quote entities; headers do not.
        if ($0 ~ /quot/) {
            printf ("%3s %4s %s\n", $2, $4, $3);
        }
    }
}' | perl -npe 's/&#(\d*);/($1 & ~0x7f ? chr(0xc0 | (($1 >> 6) & 0x3f)).chr(0x80 | ($1 & 0x3f)) : chr($1))/eg'
</source>

Und so sieht es aus: <source lang="bash">
$ ./pnv
Nr. Min. Richtung

75    6 Pirnaischer Pl.
13   10 Mickten
13   11 Prohlis
75   12 Leubnitzer Höhe
 9   13 Prohlis
 9   13 Kaditz
 9   14 Kaditz
75   15 Messe Dresden
89   15 Löbtau
 9   15 Prohlis

</source>

2.) DVB Routenplanung, Usage: ./t [destination [station]] (z.B. `./t "Max Muster Strasse" HBF', `./t HBF' oder `./t') <source lang="bash">

#!/bin/bash

# Destination and origin stop; each falls back to a default when the
# argument is missing. They must be separate statements: as prefixes of the
# awk command they would only be exported to awk, and the ${...} expansions
# in the request URL would be empty.
dststation=${1:-My default target}
srcstation=${2:-My default station}

# The request is sent by gawk over its /inet/tcp special file; the reply is
# split into table rows (RS) and cells (FS).
# ${var// /%20} encodes every space in the stop names, not just the first.
# NOTE(review): parameter names such as itDateDay and idtTimeMinute are kept
# exactly as found; they look inconsistent with itdTimeHour - verify against
# the EFA interface.
awk -vrequest="http://efa.vvo-online.de:8080/dvb/XSLT_TRIP_REQUEST2?sessionID=0&requestID=0&language=de&usage=xslt_trip&execInst=normal&command=&ptOptionsActive=-1&itOptionsActive=&itDateDay=`date +%d`&itDateMonth=`date +%m`&itDateYear=`date +%y`&place_origin=Dresden&placeState_origin=empty&type_origin=stop&name_origin=${srcstation// /%20}&nameState_origin=empty&place_destination=Dresden&placeState_destination=empty&type_destination=stop&name_destination=${dststation// /%20}&nameState_destination=empty&itdTripDateTimeDepArr=dep&itdTimeHour=`date +%H`&idtTimeMinute=`date +%M`" \
    -vRS="<tr[^>]*>|</tr>" \
    -vFS='<td[^>]*>|</td>|\\("|"\\)' 'BEGIN {
    isDest = 0; route = 0; curr = 0;
    s = "/inet/tcp/0/efa.vvo-online.de/8080";
    print "GET " request " HTTP/1.1\r\nHost: efa.vvo-online.de\r\nUser-Agent: akts!zr\r\nAccept: */*\r\nConnection: close\r\n\r\n" |& s;
    while (s |& getline) {
        # Ambiguous stop name: collect the offered alternatives. Once the
        # destination list has been seen, later options count as destinations.
        if ($0 ~ /option value="[0-9]+:[0-9]+"/) {
            if ($0 ~ /name_destination/) {
                isDest = 1;
            }
            split($0, a, /<option[^>]*>|<\/option>/);
            if (isDest) {
                for (pos = 2; a[pos]; pos += 2)
                    dest = dest (dest ? ", " : "") a[pos];
            } else {
                for (pos = 2; a[pos]; pos += 2)
                    src = src (src ? ", " : "") a[pos];
            }
            continue;
        }
        # Overview row: remember the summary line of each trip.
        if ($0 ~ /#ROUTE/) {
            split($0, a, /<a[^>]*>|<\/a>/);
            routes[++route] = a[2];
            continue;
        }
        # Start of a trip detail block: print its summary and a header.
        if ($0 ~ /"ROUTE_[0-9]"/) {
            print routes[++curr];
            print " Zeit Linie Station";
            continue;
        }
        # Detail row starting with a time: one leg of the trip.
        if ($2 ~ /^[0-9][0-9]:[0-9][0-9]/) {
            split($11, a, / /);
            line = a[2];
            printf ("  %s %5s %s\n", $2, line, $4 " " $6);
        }
    }
    if (src || dest) {
        print "Gehts etwas genauer?";
        if (src) print "Einstieg: " src "?";
        if (dest) print "Ausstieg: " dest "?";
    }
}' | perl -npe 's/([\x80-\xff])/(chr(0xc0 | ((ord($1) >> 6) & 0x3f)).chr(0x80 | (ord($1) & 0x3f)))/eg'
</source>

Und so sieht es aus: <source lang="bash">
$ ./t Terminal Mock
Gehts etwas genauer?
Einstieg: Campingplatz Mockritz, Mockethaler Straße, Mockritz, Mockritzer Straße?
$ ./t Terminal "Mockritzer Str"
1. Fahrt am 01.12.2008 18:40 - 19:44 Uhr

 Zeit  Linie    Station
 18:40    13 ab Dresden Mockritzer Straße
 19:02       an Dresden Bischofsweg
 19:05     7 ab Dresden Bischofsweg
 19:16       an Dresden Infineon Nord
 19:26    77 ab Dresden Infineon Nord
 19:32       an Dresden Flughafen

2. Fahrt am 01.12.2008 19:05 - 19:49 Uhr

 Zeit  Linie    Station
 19:05    75 ab Dresden Mockritzer Straße
 19:08       an Dresden Hp Strehlen
 19:13     S ab Dresden-Strehlen
 19:40       an Dresden Flughafen
 19:48    77 ab Dresden Flughafen
 19:49       an Dresden Flughafen Terminal 1

3. Fahrt am 01.12.2008 19:05 - 19:57 Uhr ... </source>

Python

CLI-Interface

<source lang="python">

#!/usr/bin/python

import sys

from urllib import urlencode, urlopen
from optparse import OptionParser

from BeautifulSoup import BeautifulStoneSoup
import simplejson

# Common URL prefix of the VVO widget endpoints used below
# ("Abfahrten.do" and "Haltestelle.do" are appended to it).
widgets_base_url = "http://widgets.vvo-online.de/abfahrtsmonitor/"

def get_connections(stop=None, town=None, time=None):
    """
    Get the next connections at *stop* in *town* *time* minutes from now.

    Arguments that are None are left out of the request. The HTML entities
    in the response are decoded before the payload is parsed as JSON.

    Returns the parsed payload; the callers in this file treat it as a
    sequence of (line, destination, time) triples.
    """
    candidates = (("hst", stop), ("ort", town), ("vz", time))
    query_params = [(key, value) for key, value in candidates
                    if value is not None]
    query_url = widgets_base_url + "Abfahrten.do?" + urlencode(query_params)
    response = urlopen(query_url)
    try:
        page_data = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    connections_soup = BeautifulStoneSoup(page_data, convertEntities="html")
    return simplejson.loads(connections_soup.contents[0])


def find_stops(stop, town=None):
    """
    Get stops with the given name in *town*.

    *town* is left out of the request when it is None. The HTML entities
    in the response are decoded before the payload is parsed as JSON.

    Returns a (towns, stops) pair. main() reads the first item of each
    town entry as its name and unpacks each stop entry as
    (stop_name, town, stop_id).
    """
    query_params = [("hst", stop)]
    if town is not None:
        query_params.append(("ort", town))
    query_url = widgets_base_url + "Haltestelle.do?" + urlencode(query_params)
    response = urlopen(query_url)
    try:
        page_data = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    stops_soup = BeautifulStoneSoup(page_data, convertEntities="html")
    towns, stops = simplejson.loads(stops_soup.contents[0])
    return towns, stops


def format_connections(connections):
    """
    Format connections into a nice table.  Returns a generator for the
    table's rows: a header line, a rule, then one line per connection.

    *connections* is any iterable of (line, destination, time) triples; it
    may be empty, in which case only the header and the rule are produced.
    The line column is at least 5 and the destination column at least 23
    characters wide; both grow to fit their longest entry.
    """
    # Materialize once: the data is walked several times below.
    connections = list(connections)
    # Put the minimum width into the list, so max() never receives a lone
    # integer when there are no connections.
    destination_column_length = max(
        [23] + [len(destination) for _, destination, _ in connections])
    line_name_column_length = max(
        [5] + [len(line_name) for line_name, _, _ in connections])
    line_format = "%%-%ds | %%%ds | %%7s" % (
        line_name_column_length, destination_column_length)
    yield line_format % ("line", "destination", "arrival")
    yield "-+-".join(
        "-" * width
        for width in (line_name_column_length, destination_column_length, 7))
    for line_name, destination, time in connections:
        yield line_format % (line_name, destination, time)


def print_connections(stop, town, connections, limit=None):
    """
    Print *connections* at *stop* in *town* to stdout.  If a limit is given
    only that many connections are printed otherwise all.

    *town* may be None; the " in <town>" part is then left out of every
    message.  When there are no connections a notice is printed and the
    process exits with status 1.
    """
    if town is None:
        location = "%s" % (stop,)
    else:
        location = "%s in %s" % (stop, town)
    if len(connections) == 0:
        print("No connections at %s." % (location,))
        sys.exit(1)
    print("Next connections at %s:" % (location,))
    print("")
    # A limit of None (or 0) means "show everything".
    shown = connections[:limit] if limit else connections
    for line in format_connections(shown):
        print(line)


def main():
    """
    Main function: parse the command line, optionally resolve the stop
    name via find_stops(), and print the next connections.

    Exits with status 1 when no town or stop matches, and through
    OptionParser.error() when the number of arguments is wrong.
    """
    option_parser = OptionParser(
            usage="%prog [options] [<town>] <stop>")
    option_parser.add_option("-l", "--limit",
            help="maximum number of connections to display",
            type="int",
            default=5)
    option_parser.add_option("-t", "--time",
            help="minimum time to departure",
            type="int",
            default=None)
    option_parser.add_option("-k", "--no-lookup",
            help="do not look up stop name",
            action="store_false",
            dest="lookup_stop",
            default=True)
    options, args = option_parser.parse_args()
    # Sanitize options: negative values mean "not set".  The None checks
    # avoid relying on how None orders against integers.
    if options.limit is not None and options.limit < 0:
        options.limit = None
    if options.time is not None and options.time < 0:
        options.time = None
    if len(args) == 1:
        town, stop = None, args[0]
    elif len(args) == 2:
        town, stop = args
    elif not args:
        option_parser.error("Not enough arguments")
    else:
        option_parser.error("Too many arguments")
    if not options.lookup_stop:
        connections = get_connections(stop, town, options.time)
        print_connections(stop, town, connections, options.limit)
        return
    towns, stops = find_stops(stop, town)
    if len(towns) == 0:
        if town is None:
            print("No town found for stop '%s'." % (stop,))
        else:
            print("No town named '%s'." % (town,))
        sys.exit(1)
    if len(stops) == 0:
        print("No stop named '%s' in the following towns:" % (stop,))
        print("\n".join("    " + t[0] for t in towns))
        sys.exit(1)
    # A separate loop variable keeps the town given on the command line
    # from being overwritten.
    for stop_name, stop_town, stop_id in stops:
        connections = get_connections(stop_id, time=options.time)
        print_connections(stop_name, stop_town, connections, options.limit)
        print("")


# Run the command-line interface only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":

   main()

</source>

Speed metal coding 64x64.jpg
Rübÿ Spëëd Mëtäl Cödïng
Coders: Astro | Conny | Sven
Projects: CacaANSICam | Date Determinator | DVB-Scraping | Filmnächte-Scraping | GeeKal | Gruntmaster | Harvester | Hirn | Irb | Jargon-File | Ruby-MediaWiki | Miniwebserver | Momomoto | Pentabarf | Podcast-fetching | Ruby | Ruby-Geekend | Ruby und Ruby on Rails | Sedusa | VDS-Badges | Xmotoctl | Youtube-Scraping