In my project I need to calculate some results using a specific external website.
I checked what the .NET Framework offers for performing operations on websites:
1) System.Web HttpRequest
http://msdn.microsoft.com/en-us/library/system.web.httprequest.aspx
2) System.Net WebClient
http://msdn.microsoft.com/en-us/library/system.net.webclient(VS.80).aspx
3) System.Windows.Forms WebBrowser
http://msdn.microsoft.com/en-us/library/system.windows.forms.webbrowser.aspx
You can use any of these three classes when you just need to read the HTML of a specific website.
I used the most advanced of them, the WebBrowser class from the System.Windows.Forms namespace.
It allows more advanced operations on the website by accessing its elements through the HTML DOM, and it also works as a JavaScript runtime.
With the WebBrowser control I can perform the following operations:
- enter data into website controls
- click buttons
- read values
My web scraping implementation:
First, the main method, which calculates results using the external website with the help of the WebBrowser control:
/// <summary>
/// Calculates a result for the given request by driving the external online tool
/// through a <see cref="WebBrowser"/> control: the page is loaded and validated,
/// every entry of the request is inserted, and the result is read back.
/// </summary>
/// <param name="xyzRequestDO">Request whose <c>xyzs</c> entries are fed into the page.</param>
/// <returns>
/// The result read from the page. If any step fails, the error is logged and the
/// result as it stood at the point of failure is returned instead of throwing.
/// </returns>
// NOTE(review): [STAOperationBehavior] presumably runs this operation on an STA thread,
// which the WebBrowser control requires - confirm against the attribute's definition.
[STAOperationBehavior]
public XYZResultDO Calculate(XYZRequestDO xyzRequestDO)
{
    Log.DebugFormat("Calculate: {0}", xyzRequestDO);

    var result = new XYZResultDO();
    try
    {
        InitBrowser();
        LoadPage(PageUrl);

        if (!ValidatePage())
        {
            Log.Fatal("online tool is not available or not valid!");
            throw new ApplicationException("online tool is not available or not valid!");
        }

        foreach (XyzDO xyzDO in xyzRequestDO.xyzs)
        {
            InsertInput(xyzDO);
        }

        result = ReadResults();
        Log.DebugFormat("Page Web scraping. Calculated result {0}", result.ToString());

        return result;
    }
    catch (Exception e)
    {
        // Deliberate best effort: log the failure and hand back whatever result we have.
        Log.FatalFormat("Fehler bei Berechnung {0}", e.ToString());
        return result;
    }
    finally
    {
        // The browser is created per calculation, so always release it again.
        if (_webBrowser != null)
        {
            _webBrowser.Dispose();
            _webBrowser = null;
        }
    }
}

// This way I initialize my browser control:

/// <summary>
/// Creates a fresh <see cref="WebBrowser"/> instance and subscribes to its
/// DocumentCompleted and ProgressChanged events.
/// </summary>
private void InitBrowser()
{
    Log.Debug("InitBrowser() InitializeBrowser");
    _webBrowser = new WebBrowser();

    _webBrowser.DocumentCompleted += WebBrowser_DocumentCompleted;
    _webBrowser.ProgressChanged += webBrowser_ProgressChanged;
}

// Loading the page into the browser control:

/// <summary>
/// Navigates the browser to <paramref name="uri"/> and waits until the document
/// has been loaded or the wait times out. Does nothing if no browser exists.
/// </summary>
/// <param name="uri">Address of the page to load.</param>
private void LoadPage(string uri)
{
    if (_webBrowser != null)
    {
        Log.DebugFormat("LoadPage() Page wird geladen von {0}", uri);
        _webBrowser.Navigate(uri);
        WaitToLoadDocument();
    }
}

// Handling problems while loading:

/// <summary>
/// Pumps the message loop until <see cref="WebBrowser_DocumentCompleted"/> clears the
/// busy flag or 30 seconds have elapsed. A timeout is logged as a warning.
/// </summary>
private void WaitToLoadDocument()
{
    const int loadTimeoutSeconds = 30;

    _browserBusy = true;
    // UTC is used so a local clock shift (e.g. DST) cannot shorten or extend the timeout.
    DateTime deadline = DateTime.UtcNow.AddSeconds(loadTimeoutSeconds);
    Log.DebugFormat("WaitToLoadDocument() Laden von Webseite in Browser.");
    while (_browserBusy && deadline > DateTime.UtcNow)
    {
        // Navigation completes via window messages, so they must be processed while waiting.
        Application.DoEvents();
    }

    if (_browserBusy)
    {
        Log.Warn("WaitToLoadDocument() Timeout: page was not loaded within the time limit.");
    }
}

/// <summary>
/// Event handler for DocumentCompleted: marks the browser as no longer busy.
/// </summary>
// NOTE(review): DocumentCompleted may fire once per frame on pages with frames;
// confirm the target page has none, otherwise the wait can end early.
private void WebBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    _browserBusy = false;
    Log.DebugFormat("WebBrowser_DocumentCompleted() Dokument {0} geladen in Browser.", e.Url);
}

// Validate the loaded page, to ensure that all required elements are found:

/// <summary>
/// Checks that a document is loaded and that every element the scraper relies on
/// is present in it.
/// </summary>
/// <returns><c>true</c> if the page can be used; otherwise <c>false</c>.</returns>
private bool ValidatePage()
{
    Log.Debug("ValidatePage() Validiere geladenene Page");

    // Check that a document is loaded into the browser.
    if (_webBrowser == null || _webBrowser.Document == null)
    {
        Log.Warn("ValidatePage() Browser oder Page fehlt.");
        return false;
    }

    // Check that all required document elements exist; one missing element invalidates the page.
    if (_webBrowser.Document.GetElementById("Ergebnis") == null
        || _webBrowser.Document.GetElementById("fehler") == null
        || _webBrowser.Document.GetElementById("bild1") == null
        || _webBrowser.Document.GetElementById("bild2") == null
        || _webBrowser.Document.GetElementById("value") == null
        || _webBrowser.Document.GetElementById("neu") == null
        || _webBrowser.Document.GetElementById("Absenden") == null)
    {
        Log.Warn("ValidatePage() Page ist nicht valid.");
        return false;
    }

    return true;
}

// Perform operations on the website using the HTML DOM:
// - reading an element value,
// - setting an element value,
// - click operation

/// <summary>
/// Reads the "value" attribute of the element with the given id.
/// </summary>
/// <param name="elementName">Id of the element to read.</param>
/// <returns>The element's "value" attribute.</returns>
/// <exception cref="InvalidOperationException">No element with that id exists.</exception>
private string GetPageElementValue(string elementName)
{
    HtmlElement element = _webBrowser.Document.GetElementById(elementName);
    if (element == null)
    {
        throw new InvalidOperationException(
            string.Format("Page element '{0}' was not found.", elementName));
    }

    return element.GetAttribute("value");
}

/// <summary>
/// Sets the "value" attribute of the element with the given id.
/// A missing element is silently ignored.
/// </summary>
/// <param name="elementName">Id of the element to modify.</param>
/// <param name="elementValue">New value for the element.</param>
private void SetPageElementValue(string elementName, string elementValue)
{
    HtmlElement element = _webBrowser.Document.GetElementById(elementName);
    if (element != null)
    {
        element.SetAttribute("value", elementValue);
    }
}

/// <summary>
/// Invokes the "click" member on the element with the given id.
/// </summary>
/// <param name="elementName">Id of the element to click.</param>
/// <exception cref="InvalidOperationException">No element with that id exists.</exception>
private void ClickPageElement(string elementName)
{
    HtmlElement element = _webBrowser.Document.GetElementById(elementName);
    if (element == null)
    {
        throw new InvalidOperationException(
            string.Format("Page element '{0}' was not found.", elementName));
    }

    element.InvokeMember("click");
}

exactly what I was looking for. Awesome. Thanks.
Hello,
Web scraping is a computer software technique of extracting information from websites. It is a method of pulling information from the seemingly infinite number of locations on the web where it is stored. Thanks a lot...