In my project I need to calculate some results using a specific external website.
I checked what the .NET Framework offers for performing operations on websites:
1) System.Web HttpRequest
http://msdn.microsoft.com/en-us/library/system.web.httprequest.aspx
2) System.Net WebClient
http://msdn.microsoft.com/en-us/library/system.net.webclient(VS.80).aspx
3) System.Windows.Forms WebBrowser
http://msdn.microsoft.com/en-us/library/system.windows.forms.webbrowser.aspx
You can use any of these three classes when you just need to read the HTML of a specific website.
I used the most advanced of them, the WebBrowser class from the System.Windows.Forms namespace.
It allows more advanced operations on the website by accessing its elements through the HTML DOM, and it also works as a JavaScript runtime.
With the WebBrowser control I can perform the following operations:
- enter data into website controls
- click buttons
- read values
My web scraping implementation:
First, the main method, which calculates results using the external website with the help of the WebBrowser control:
 1:  [STAOperationBehavior]   2:   3: public XYZResultDO Calculate(XYZRequestDO xyzRequestDO)
   4:  {5: Log.DebugFormat("Calculate: {0}", gefahrstoffRequestDO);
   6:   7: var result = new XYZResultDO();
8: try
   9:      {  10:          InitBrowser();  11:          LoadPage(PageUrl);  12:          13: if (!ValidatePage())
  14:          {15: Log.Fatal("online tool is not available or not valid!");
16: throw new ApplicationException("online tool is not available or not valid!");
  17:          }  18:          19: foreach (XyzDO xyzDO in requestDO.xyzs)
  20:          {  21:              InsertInput(xyzDO);  22:          }  23:     24:          result = ReadResults();25: Log.DebugFormat("Page Web scraping. Calculated result {0}", result.ToString());
  26:   27: return result;
  28:      }29: catch (Exception e)
  30:      {31: Log.FatalFormat("Fehler bei Berechnung {0}", e.ToString());
32: return result;
  33:      }34: finally
  35:      {36: if(_webBrowser !=null)
  37:          {  38:              _webBrowser.Dispose();39: _webBrowser = null;
  40:          }   41:      }  42:  }This way I initialize my browser control2: {3: Log.Debug("InitBrowser() InitializeBrowser");4: _webBrowser = new WebBrowser();5:6: _webBrowser.DocumentCompleted += WebBrowser_DocumentCompleted;7: _webBrowser.ProgressChanged += webBrowser_ProgressChanged;8: }Loading page into the browser control:2: {3: if (_webBrowser != null)4: {5: Log.DebugFormat("LoadPage() Page wird geladen von {0}", info.FullName);6: _webBrowser.Navigate(url);7: WaitToLoadDocument();8: }9:handle problems by loading2: {3: _browserBusy = true;4: DateTime startSeiteLaden = DateTime.Now;5: Log.DebugFormat("WaitToLoadDocument() Laden von Webseite in Browser.");6: while (_browserBusy && startSeiteLaden.AddSeconds(30) > DateTime.Now)7: {8: Application.DoEvents();9: }10: }11:12: private void WebBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)13: {14: _browserBusy = false;15: Log.DebugFormat("ValidateKolumbusPage() Dokument {0} geladen in Browser.", e.Url);16: }Validate loaded page, to ensure that all required elements are found2: {3: Log.Debug("ValidatePage() Validiere geladenene Page");4: //check if document is loaded into the browser5: if (_webBrowser == null _webBrowser.Document == null)6: {7: Log.Warn("ValidatePage() Browser oder Page fehlt.");8: return false;9: }10:11: //check if all document elements existis12: if (_webBrowser.Document.GetElementById("Ergebnis") == null13: _webBrowser.Document.GetElementById("fehler") == null14: _webBrowser.Document.GetElementById("bild1") == null15: _webBrowser.Document.GetElementById("bild2") == null16: _webBrowser.Document.GetElementById("value") == null17: _webBrowser.Document.GetElementById("neu") == null18: _webBrowser.Document.GetElementById("Absenden") == null)19: {20: Log.Warn("ValidatePage() Page ist nicht valid.");21: return false;22: }23:24: return true;25: }Perfom operations on the website using HTML DOM- Reading element value,- set element value,- click operation2: {3: HtmlElement 
element = _webBrowser.Document.GetElementById(elementName);4: return element.GetAttribute("value");5:6: }7:8:9:10:11: private void SetPageElementValue(string elementName, string elementValue)12: {13: HtmlElement element = _webBrowser.Document.GetElementById(elementName);14: if (element != null)15: {16: element.SetAttribute("value", elementValue);17: }18: }19:20:21:22: private void ClickPageElement(string elementName)23: {24: var element = _webBrowser.Document.GetElementById(elementName);25: element.InvokeMember("click");26: }
 

exactly what I was looking for. Awesome. Thanks.
Hello,

Web scraping is a computer software technique of extracting information from websites. It is a method of pulling information from the seemingly infinite number of locations on the web where it is stored. Thanks a lot...