DVB-Scraping

Aus C3D2
Version vom 18:57, 1. Dez 2008; Wod (Diskussion | Beiträge)

(Unterschied) ← Nächstältere Version | Aktuelle Version (Unterschied) | Nächstjüngere Version → (Unterschied)
Wechseln zu: Navigation, Suche


Inhaltsverzeichnis

Ruby

Telnet-Haltestellenmonitor

b0rk3d

Teile des folgenden Inhalts sind nicht korrekt. Begründung: Die Struktur der WAP-Seiten war nicht permanent und wurde in der Zwischenzeit geändert. Jemand muss das Skript an die neuen URLs anpassen.

#!/usr/bin/env ruby
require 'net/http'
require 'socket'
require 'rexml/document'
# Error that carries a list of candidate station names; its message
# enumerates all of them.
class MultipleStations < RuntimeError
  # stations: collection of station names; it is joined with ', ' when the
  # message is rendered.
  def initialize(stations)
    @stations = stations
  end

  # Returns the (German) message listing every candidate station.
  def to_s
    "Mehrere mögliche Haltestellen: #{@stations.join(', ')}"
  end
end
# Departure board extracted from one result card and rendered as a plain
# text table.
class StationResult
  # card: object responding to each_element(xpath) and to_s.
  #
  # The text of the first 'p/strong' element becomes the station name, the
  # text of the second one the time stamp. Departures are taken from the
  # serialised card with a regular expression; every hit is a
  # [time, tram, direction] triple of strings.
  def initialize(card)
    headings = []
    card.each_element('p/strong') { |strong| headings << strong.text }
    @name, @time = headings
    @trams = card.to_s.scan(/br\/>(\d+:\d+\*?) (.+?)<br\/>-> (.+?)</)
  end

  # Renders the heading line followed by a three-column table
  # (Zeit | Linie | Ziel). Each column is as wide as its widest cell,
  # the column title included. There is no trailing newline after the
  # last departure row.
  def to_s
    titles = %w[Zeit Linie Ziel]
    widths = titles.each_with_index.map do |title, col|
      ([title.size] + @trams.map { |row| row[col].size }).max
    end
    # Left-justify every cell to its column width and glue with ' | '.
    render = lambda do |cells|
      cells.each_with_index.map { |cell, col| cell.ljust(widths[col]) }.join(' | ')
    end
    rule = widths.map { |width| '-' * width }.join('-+-')
    body = @trams.map { |row| render.call(row) }.join("\n")

    "\n\n#{@name}, #{@time}:\n\n#{render.call(titles)}\n#{rule}\n#{body}"
  end
end
# Serves a single connected client: prompts for a station name, queries the
# DVB WAP departure monitor over HTTP and writes the formatted answer (or an
# error message) back to the client socket.
class ClientHandler
  WAP_HOST = 'wap.dvbag.de'
  WAP_PATH = '/wapVVO/wap-rbl.php'

  # socket: connected client socket (needs peeraddr, print, puts, gets,
  # flush and close).
  #
  # Logs the peer address and starts a thread that runs #handle. Errors
  # raised while handling are reported to the client as "Fehler: ..."; the
  # socket is closed in every case.
  def initialize(socket)
    @socket = socket
    puts "#{address} connected"
    Thread.new do
      begin
        handle
      rescue StandardError => e
        # StandardError only: signals and SystemExit must not be swallowed.
        @socket.puts("Fehler: #{e}")
      ensure
        @socket.close
      end
    end
  end

  # Returns the peer as "host:port"; IPv6 hosts are wrapped in brackets.
  def address
    family, port, _hostname, ip = @socket.peeraddr
    host = family == 'AF_INET6' ? "[#{ip}]" : ip.to_s
    "#{host}:#{port}"
  end

  # Queries the departure monitor for +station+.
  #
  # Returns the departure table as a String when the answer contains a card
  # with id 'result'.
  # Raises MultipleStations when the answer is a card with id 'liste', and
  # RuntimeError for non-success HTTP responses, a missing card or an
  # unexpected card id.
  def ask_haltestellenmonitor(station)
    res = Net::HTTP.start(WAP_HOST) do |http|
      http.get("#{WAP_PATH}?#{query_string(station)}")
    end
    raise res.class.to_s unless res.kind_of?(Net::HTTPSuccess)

    wml = REXML::Document.new(res.body).root
    card = nil
    # If several cards are present the last one wins (as before).
    wml.each_element('/wml/card') { |c| card = c } if wml
    raise "No card found in result document" unless card

    case card.attributes['id']
    when 'liste'
      stations = []
      card.each_element('p/select/option') { |option| stations << option.text }
      raise MultipleStations.new(stations)
    when 'result'
      StationResult.new(card).to_s
    else
      raise "Unexpected card/@id: #{card.attributes['id']}"
    end
  end

  # Runs the dialogue with the client: greeting, prompt, one query, answer.
  # Does nothing further when the client disconnects before sending a line.
  def handle
    @socket.print "Hallo #{address}\n\nHaltestelle: "
    @socket.flush
    haltestelle = @socket.gets
    return unless haltestelle

    haltestelle.strip!
    puts "#{address} asks for #{haltestelle.inspect}"
    @socket.puts "Anfrage nach #{haltestelle}..."
    @socket.puts ask_haltestellenmonitor(haltestelle)
  end

  private

  # Builds the form-encoded query string for +station+ at time +now+.
  # The station comes straight from the network, so every value is
  # percent-encoded ('&', '=', umlauts, ...) instead of only replacing
  # spaces; spaces are still sent as '+'. A single timestamp is used so
  # that time and date cannot disagree around midnight.
  def query_string(station, now = Time.now)
    URI.encode_www_form(
      [[:station, station],
       [:action, :check],
       [:time, now.strftime('%H:%M')],
       [:date, now.strftime('%d.%m.%Y')]]
    )
  end
end
# Listen on port 65023 on all IPv4 interfaces and hand every accepted
# connection to a new ClientHandler.
listener = TCPServer.new('0.0.0.0', 65023)
while (connection = listener.accept)
  ClientHandler.new(connection)
end

Und dann:

telnet localhost 65023

NCurses Monitor

Der VVO stellt jetzt ja für seine Widgets eine JSON(?)-Variante der Daten zur Verfügung.

require 'net/http'
require 'ncurses'
 
# Client for the departure data the VVO publishes for its widgets.
class DvbAbfahrt
  BASE_URL = 'http://widgets.vvo-online.de/abfahrtsmonitor/Abfahrten.do'

  # Fetches the departures for stop +hst+ in town +ort+; +vz+ is passed on
  # as the 'vz' query parameter.
  #
  # Returns an Array of three-element String arrays, one per match of
  # digits, text without commas, digits in the decoded response.
  # NOTE(review): judging by how Monitor prints them these are line,
  # destination and minutes until departure - confirm against the service.
  def fetch(ort, hst, vz = 0)
    # TODO: exception handling (timeouts, HTTP errors)
    # The values are form-encoded individually. This replaces the removed
    # URI.escape and avoids the old placeholder substitution, which broke
    # when a value itself contained "HST" or "VZ".
    query = URI.encode_www_form([['ort', ort], ['hst', hst], ['vz', vz]])
    body = Net::HTTP.get(URI.parse("#{BASE_URL}?#{query}"))
    umlauts_hack(body).scan(/(\d+),([^,]{1,}),(\d+)/)
  end

  # Strips every '&quot;' and decodes decimal character references
  # ('&#228;' -> 'ä') in +s+. All code points are decoded, not only
  # ü, ö and ß. The string is modified in place and returned.
  def umlauts_hack(s)
    s.gsub!('&quot;', '')
    s.gsub!(/&#(\d+);/) do |entity|
      code = Regexp.last_match(1).to_i
      # Leave references beyond the Unicode range untouched.
      code <= 0x10FFFF ? [code].pack('U') : entity
    end
    s
  end
end
 
# Full-screen (ncurses) departure board for one stop, refreshed
# periodically.
class Monitor
  # ort, hst, vz: passed unchanged to DvbAbfahrt#fetch on every refresh.
  #
  # Initialises ncurses (yellow on black, hidden cursor) and shows
  # "Loading..." until the first data arrives.
  def initialize(ort, hst, vz = 0)
    Ncurses.initscr
    Ncurses.start_color
    Ncurses.init_pair(1, Ncurses::COLOR_YELLOW, Ncurses::COLOR_BLACK)
    Ncurses.attron(Ncurses.COLOR_PAIR(1))
    Ncurses.curs_set(0)
    Ncurses.move(0, 0)
    Ncurses.printw "Loading..."
    Ncurses.refresh

    @sep = "|"
    @timeout = 30 # seconds slept between two refreshes

    # Column widths of the three fields of a row.
    @wlno = 3
    @wlname = 20
    @weta = 3

    @ort = ort
    @hst = hst
    @vz = vz
    @dvb = DvbAbfahrt.new
    @lines = 0 # number of rows drawn by the previous refresh
  end

  # Draws the stop name centred over the table width, in reverse bold.
  def print_title
    Ncurses.attron(Ncurses::A_REVERSE | Ncurses::A_BOLD)
    # '%s' keeps a '%' in the stop name from being read as a format
    # directive; +2 accounts for the two separators.
    Ncurses.mvprintw(0, 0, '%s', @hst.center(@wlno + @wlname + @weta + 2))
    Ncurses.attroff(Ncurses::A_REVERSE | Ncurses::A_BOLD)
  end

  # Fetches and redraws the board forever, sleeping @timeout seconds
  # between refreshes. Never returns normally.
  def mainloop
    print_title
    loop do
      info = @dvb.fetch(@ort, @hst, @vz)

      # Wipe the screen when the new list is not longer than the old one,
      # so that no stale rows stay visible.
      if @lines >= info.size
        Ncurses.clear
        print_title
      end
      @lines = info.size

      info.each_with_index do |entry, i|
        Ncurses.mvprintw(i + 1, 0, '%s', format_row(entry))
      end
      Ncurses.refresh
      Ncurses.move(0, 0)
      sleep @timeout
    end
  end

  private

  # Formats one fetched triple as a table row. The middle field is cut to
  # exactly @wlname characters (the former inclusive range kept one
  # character too many, making long rows wider than the title).
  def format_row(entry)
    first, middle, last = entry
    first.rjust(@wlno) + @sep + middle[0, @wlname].center(@wlname) + @sep + last.rjust(@weta)
  end
end
 
 
 
# Script entry point: show the departure board for the stop named by the
# first command line argument (town fixed to "Dresden").
if __FILE__ == $0
  if ARGV.empty?
    # ncurses was never initialised on this path, so there is nothing to
    # tear down; the usage text goes to the normal terminal.
    puts "USAGE #{$0} <HALTESTELLENNAME>"
  else
    begin
      m = Monitor.new("Dresden", ARGV[0])
      m.mainloop
    ensure
      # Restore the terminal however the main loop ends.
      Ncurses::attroff(Ncurses::COLOR_PAIR(1))
      Ncurses::endwin
    end
  end
end

awk (gawk!)

1.) DVB Stationsmonitor, Usage: ./pnv [station] (z.B. ./pnv Bischofsweg)

#!/bin/bash
# DVB station monitor. Usage: ./pnv [station]
#
# gawk opens a TCP connection to widgets.vvo-online.de:80 (gawk's
# /inet/tcp special file with the |& co-process operator), sends an HTTP
# GET for Abfahrten.do (ort=Dresden, vz=5) and prints fields 2, 4 and 3 of
# every record containing "quot" under the header "Nr. Min. Richtung".
# Records are split on "],[" or a blank line, fields on &quot; runs.
# The trailing perl filter turns decimal &#NNN; references into UTF-8.
#
# All spaces in the station name are escaped as %20 (${station// /%20});
# the former single-slash form escaped only the first space and produced a
# broken request line for names with more than one space.
station=${@:-Mockritzer Strasse}
awk  -vRS='\\],\\[|\r\n\r\n' -vFS='&quot;(,&quot;)*' -vstation="${station// /%20}" 'BEGIN { s="/inet/tcp/0/widgets.vvo-online.de/80"; print "GET /abfahrtsmonitor/Abfahrten.do?ort=Dresden&hst=" station "&vz=5 HTTP/1.1\r\nHost: widgets.vvo-online.de\r\nAccept: */*\r\nConnection: close\r\n\r\n" |& s; print "Nr. Min. Richtung"; while (s |& getline) { if ($0 ~ /quot/) { printf ("%3s %4s %s\n", $2, $4, $3); } } }' | perl -npe 's/&#(\d*);/($1 & ~0x7f ? chr(0xc0 | (($1 >> 6) & 0x3f)).chr(0x80 | ($1 & 0x3f)) : chr($1))/eg'

Und so sieht es aus:

$ ./pnv
Nr. Min. Richtung
 75    6 Pirnaischer Pl.
 13   10 Mickten
 13   11 Prohlis
 75   12 Leubnitzer Höhe
  9   13 Prohlis
  9   13 Kaditz
  9   14 Kaditz
 75   15 Messe Dresden
 89   15 Löbtau
  9   15 Prohlis

2.) DVB Routenplanung, Usage: ./t [destination [station]] (z.B. `./t "Max Muster Strasse" HBF', `./t HBF' oder `./t')

#!/bin/bash
# DVB route planner. Usage: ./t [destination [station]]
#
# gawk opens a TCP connection to efa.vvo-online.de:8080, requests
# XSLT_TRIP_REQUEST2 for a trip from the source to the destination stop
# (both in Dresden, departing now) and scrapes the returned HTML table:
#   * <option value="N:N"> lists are collected as ambiguous source or
#     destination names and reported at the end ("Gehts etwas genauer?"),
#   * rows containing "#ROUTE" supply the route headings,
#   * rows whose second field starts with HH:MM are printed as
#     time / line / station.
# The trailing perl filter re-encodes bytes 0x80-0xff as two-byte UTF-8.
#
# All spaces in both station names are escaped as %20 (${var// /%20}); the
# former single-slash form escaped only the first space, so a name such as
# "Max Muster Strasse" produced a broken request line.
#
# NOTE(review): the query uses "idtTimeMinute" next to "itdTimeHour", and
# "itDate*" for the date fields - these look like misspelt parameter names;
# verify against the EFA interface before changing them.
dststation=${1:-My default target}
srcstation=${2:-My default station}
awk -vrequest="http://efa.vvo-online.de:8080/dvb/XSLT_TRIP_REQUEST2?sessionID=0&requestID=0&language=de&usage=xslt_trip&execInst=normal&command=&ptOptionsActive=-1&itOptionsActive=&itDateDay=`date +%d`&itDateMonth=`date +%m`&itDateYear=`date +%y`&place_origin=Dresden&placeState_origin=empty&type_origin=stop&name_origin=${srcstation// /%20}&nameState_origin=empty&place_destination=Dresden&placeState_destination=empty&type_destination=stop&name_destination=${dststation// /%20}&nameState_destination=empty&itdTripDateTimeDepArr=dep&itdTimeHour=`date +%H`&idtTimeMinute=`date +%M`" -vRS="<tr[^>]*>|</tr>" -vFS='<td[^>]*>|</td>|\\("|"\\)' 'BEGIN { isDest=0; route=0; curr=0; s="/inet/tcp/0/efa.vvo-online.de/8080"; print "GET " request " HTTP/1.1\r\nHost: efa.vvo-online.de\r\nUser-Agent: akts!zr\r\nAccept: */*\r\nConnection: close\r\n\r\n" |& s; while (s |& getline) { if($0 ~ /option value="[0-9]+:[0-9]+"/) { if($0 ~ /name_destination/) isDest = 1; split($0, a, /<option[^>]*>|<\/option>/); if(isDest) for(pos = 2; a[pos]; pos += 2) dest = dest (dest ? ", " : "") a[pos]; else for(pos = 2; a[pos]; pos += 2) src = src (src ? ", " : "") a[pos]; continue; } if($0 ~ /#ROUTE/) { split($0, a, /<a[^>]*>|<\/a>/); routes[++route]=a[2]; continue; } if($0 ~ /"ROUTE_[0-9]"/) { print routes[++curr];  print "  Zeit  Linie    Station"; continue; } if($2 ~ /^[0-9][0-9]:[0-9][0-9]/) { split($11, a, / /); line=a[2]; printf ("  %s %5s %s\n", $2, line, $4 " " $6); } } if(src || dest) {  print "Gehts etwas genauer?"; if(src) print "Einstieg: " src "?"; if(dest) print "Ausstieg: " dest "?"; } }' | perl -npe 's/([\x80-\xff])/(chr(0xc0 | ((ord($1) >> 6) & 0x3f)).chr(0x80 | (ord($1) & 0x3f)))/eg'

Und so sieht es aus:

$ ./t Terminal Mock
Gehts etwas genauer?
Einstieg: Campingplatz Mockritz, Mockethaler Straße, Mockritz, Mockritzer Straße?
$ ./t Terminal "Mockritzer Str"
1. Fahrt am 01.12.2008 18:40 - 19:44 Uhr
  Zeit  Linie    Station
  18:40    13 ab Dresden Mockritzer Straße
  19:02       an Dresden Bischofsweg
  19:05     7 ab Dresden Bischofsweg
  19:16       an Dresden Infineon Nord
  19:26    77 ab Dresden Infineon Nord
  19:32       an Dresden Flughafen
2. Fahrt am 01.12.2008 19:05 - 19:49 Uhr
  Zeit  Linie    Station
  19:05    75 ab Dresden Mockritzer Straße
  19:08       an Dresden Hp Strehlen
  19:13     S ab Dresden-Strehlen
  19:40       an Dresden Flughafen
  19:48    77 ab Dresden Flughafen
  19:49       an Dresden Flughafen Terminal 1
3. Fahrt am 01.12.2008 19:05 - 19:57 Uhr
...

Python

CLI-Interface

#!/usr/bin/python
 
import sys
 
from urllib import urlencode, urlopen
from optparse import OptionParser
 
from BeautifulSoup import BeautifulStoneSoup
import simplejson
 
# Common URL prefix of the VVO widget endpoints; the query functions append
# "Abfahrten.do?..." or "Haltestelle.do?..." to it.
widgets_base_url = "http://widgets.vvo-online.de/abfahrtsmonitor/"
 
def get_connections(stop=None, town=None, time=None):
    """
    Query ``Abfahrten.do`` for the next connections.

    *stop* is sent as ``hst``, *town* as ``ort`` and *time* as ``vz``; an
    argument that is ``None`` is left out of the query.  The response is
    entity-decoded, its first node is parsed as JSON and the decoded value
    is returned.
    """
    candidates = (("hst", stop), ("ort", town), ("vz", time))
    query_params = [(key, value) for key, value in candidates
                    if value is not None]

    query_url = "%sAbfahrten.do?%s" % (widgets_base_url,
                                       urlencode(query_params))

    soup = BeautifulStoneSoup(urlopen(query_url).read(),
                              convertEntities="html")

    return simplejson.loads(soup.contents[0])
 
 
def find_stops(stop, town=None):
    """
    Query ``Haltestelle.do`` for stops matching *stop*.

    *stop* is sent as ``hst``; *town*, when not ``None``, as ``ort``.  The
    response is entity-decoded and its first node parsed as JSON, which
    must decode to exactly two values.  They are returned as the pair
    ``(towns, stops)``.
    """
    query_params = [("hst", stop)]
    if town is not None:
        query_params += [("ort", town)]

    query_url = "%sHaltestelle.do?%s" % (widgets_base_url,
                                         urlencode(query_params))

    soup = BeautifulStoneSoup(urlopen(query_url).read(),
                              convertEntities="html")
    towns, stops = simplejson.loads(soup.contents[0])

    return towns, stops
 
 
def format_connections(connections):
    """
    Format connections into a text table and yield it row by row.

    *connections* is an iterable of ``(line_name, destination, time)``
    triples of strings.  The first row yielded is the header, the second a
    separator, followed by one row per connection.  The line column is at
    least 5 and the destination column at least 23 characters wide; both
    grow to fit their longest entry.

    An empty iterable yields just the header and the separator (this used
    to raise ``TypeError``), and one-shot iterables such as generators are
    supported because the input is materialised once.
    """
    connections = list(connections)

    # Seed the lists with the minimum widths so max() never sees an
    # empty sequence.
    destination_column_length = max(
            [23] + [len(destination) for _, destination, _ in connections])
    line_name_column_length = max(
            [5] + [len(line_name) for line_name, _, _ in connections])

    line_format = "%-" + str(line_name_column_length) + "s | %" \
            + str(destination_column_length) + "s | %7s"

    yield line_format % ("line", "destination", "arrival")
    yield "-" * line_name_column_length + "-+-" \
            + "-" * destination_column_length + "-+-" \
            + "-" * 7

    for line_name, destination, time in connections:
        yield line_format % (line_name, destination, time)
 
 
def print_connections(stop, town, connections, limit=None):
    """
    Print *connections* at *stop* in *town* to stdout.

    *town* may be ``None``; it is then left out of the messages (the
    "no connections" message used to read "in None").  If *limit* is
    truthy only the first *limit* connections are printed, otherwise all
    of them (so a limit of 0 also prints everything).

    Exits the process with status 1 when *connections* is empty.
    """
    if town is not None:
        location = "%s in %s" % (stop, town)
    else:
        location = "%s" % (stop,)

    if len(connections) == 0:
        print("No connections at %s." % (location,))
        sys.exit(1)

    # Single-argument print(...) behaves the same as a statement and as a
    # function call.
    print("Next connections at %s:" % (location,))
    print("")

    if limit:
        shown = connections[:limit]
    else:
        shown = connections

    for line in format_connections(shown):
        print(line)
 
 
def main():
    """
    Command line entry point.

    Parses ``[options] [<town>] <stop>``.  By default the stop name is
    looked up first and the connections of every matching stop are
    printed; with ``-k``/``--no-lookup`` the given names are queried
    directly.  Negative ``--limit``/``--time`` values mean "not set".

    Exits with status 2 on a wrong number of arguments and with status 1
    when no town or stop matches.
    """

    option_parser = OptionParser(
            usage="%prog [options] [<town>] <stop>")
    option_parser.add_option("-l", "--limit",
            help="maximum number of connections to display",
            type="int",
            default=5)
    option_parser.add_option("-t", "--time",
            help="minimum time to departure",
            type="int",
            default=None)
    option_parser.add_option("-k", "--no-lookup",
            help="do not look up stop name",
            action="store_false",
            dest="lookup_stop",
            default=True)

    options, args = option_parser.parse_args()

    # sanitize options; the default for --time is None, which must not be
    # compared with an integer
    if options.limit is not None and options.limit < 0:
        options.limit = None
    if options.time is not None and options.time < 0:
        options.time = None

    if len(args) == 1:
        stop = args[0]
        town = None
    elif len(args) == 2:
        town, stop = args
    else:
        # error() prints the usage and exits; it covers both too few and
        # too many arguments
        option_parser.error(
                "expected [<town>] <stop>, got %d argument(s)" % len(args))

    if options.lookup_stop:
        towns, stops = find_stops(stop, town)

        if len(towns) == 0:
            if town is not None:
                print("No town named '%s'." % (town,))
            else:
                print("No matching town found.")
            sys.exit(1)

        if len(stops) == 0:
            print("No stop named '%s' in the following towns:" % (stop,))
            print("\n".join("    " + t[0] for t in towns))
            sys.exit(1)

        # distinct loop names so the command line values are not shadowed
        for stop_name, stop_town, stop_id in stops:
            connections = get_connections(stop_id, time=options.time)

            print_connections(stop_name, stop_town, connections,
                              options.limit)
            print("")
    else:
        connections = get_connections(stop, town, options.time)
        print_connections(stop, town, connections, options.limit)
 
 
# Run the command line interface only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
Speed metal coding 64x64.jpg
Rübÿ Spëëd Mëtäl Cödïng
Coders: Astro | Conny | Sven
Projects: CacaANSICam | Date Determinator | DVB-Scraping | Filmnächte-Scraping | GeeKal | Gruntmaster | Harvester | Hirn | Irb | Jargon-File | Ruby-MediaWiki | Miniwebserver | Momomoto | Pentabarf | Podcast-fetching | Ruby | Ruby-Geekend | Ruby und Ruby on Rails | Sedusa | VDS-Badges | Xmotoctl | Youtube-Scraping
Persönliche Werkzeuge
Namensräume

Varianten
Aktionen
Navigation
Werkzeuge